Corentin Salingue 12 lat temu
rodzic
commit
9cddae04db
66 zmienionych plików z 1215 dodań i 552 usunięć
  1. 4 4
      Makefile.am
  2. 19 17
      doc/doxygen/Makefile.am
  3. 2 2
      doc/doxygen/chapters/advanced_examples.doxy
  4. 20 7
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  5. 6 0
      doc/doxygen/chapters/api/cuda_extensions.doxy
  6. 27 1
      doc/doxygen/chapters/api/data_interfaces.doxy
  7. 1 1
      doc/doxygen/chapters/api/data_management.doxy
  8. 14 4
      doc/doxygen/chapters/api/data_out_of_core.doxy
  9. 5 0
      doc/doxygen/chapters/api/mic_extensions.doxy
  10. 6 0
      doc/doxygen/chapters/api/mpi.doxy
  11. 4 0
      doc/doxygen/chapters/api/multiformat_data_interface.doxy
  12. 10 0
      doc/doxygen/chapters/api/opencl_extensions.doxy
  13. 6 10
      doc/doxygen/chapters/api/performance_model.doxy
  14. 15 0
      doc/doxygen/chapters/api/profiling.doxy
  15. 5 0
      doc/doxygen/chapters/api/scc_extensions.doxy
  16. 10 0
      doc/doxygen/chapters/api/standard_memory_library.doxy
  17. 306 0
      doc/doxygen/chapters/api/threads.doxy
  18. 111 0
      doc/doxygen/chapters/api/toolbox.doxy
  19. 4 0
      doc/doxygen/chapters/api/workers.doxy
  20. 36 0
      doc/doxygen/chapters/environment_variables.doxy
  21. 4 1
      doc/doxygen/chapters/performance_feedback.doxy
  22. 2 1
      doc/doxygen/doxygen.cfg
  23. 2 0
      doc/doxygen/refman.tex
  24. 4 2
      examples/stencil/stencil-kernels.c
  25. 0 3
      include/starpu.h
  26. 0 2
      include/starpu_data.h
  27. 1 1
      include/starpu_data_interfaces.h
  28. 0 3
      include/starpu_disk.h
  29. 3 5
      include/starpu_perfmodel.h
  30. 1 0
      include/starpu_profiling.h
  31. 1 0
      include/starpu_scheduler.h
  32. 2 2
      include/starpu_thread_util.h
  33. 0 2
      include/starpu_util.h
  34. 59 66
      mpi/src/starpu_mpi.c
  35. 7 7
      sc_hypervisor/include/sc_hypervisor.h
  36. 6 6
      sc_hypervisor/include/sc_hypervisor_lp.h
  37. 1 1
      sc_hypervisor/include/sc_hypervisor_monitoring.h
  38. 19 3
      sc_hypervisor/include/sc_hypervisor_policy.h
  39. 33 29
      sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c
  40. 28 28
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  41. 10 9
      sc_hypervisor/src/hypervisor_policies/gflops_rate_policy.c
  42. 1 1
      sc_hypervisor/src/hypervisor_policies/idle_policy.c
  43. 25 23
      sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
  44. 3 3
      sc_hypervisor/src/hypervisor_policies/ispeed_policy.c
  45. 19 12
      sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c
  46. 3 3
      sc_hypervisor/src/policies_utils/lp_programs.c
  47. 88 76
      sc_hypervisor/src/policies_utils/lp_tools.c
  48. 86 103
      sc_hypervisor/src/policies_utils/policy_tools.c
  49. 49 28
      sc_hypervisor/src/policies_utils/speed.c
  50. 40 27
      sc_hypervisor/src/sc_hypervisor.c
  51. 2 2
      sc_hypervisor/src/sc_hypervisor_intern.h
  52. 14 0
      src/core/jobs.c
  53. 1 1
      src/core/perfmodel/perfmodel.c
  54. 1 5
      src/core/perfmodel/perfmodel.h
  55. 3 23
      src/core/perfmodel/perfmodel_bus.c
  56. 15 0
      src/core/sched_ctx.c
  57. 5 5
      src/core/workers.c
  58. 3 0
      src/core/workers.h
  59. 1 1
      src/datawizard/coherency.c
  60. 1 1
      src/datawizard/memalloc.c
  61. 1 1
      src/profiling/bound.c
  62. 29 15
      src/profiling/profiling.c
  63. 14 2
      src/sched_policies/eager_central_policy.c
  64. 13 0
      src/sched_policies/eager_central_priority_policy.c
  65. 3 3
      src/sched_policies/parallel_eager.c
  66. 1 0
      src/sched_policies/work_stealing_policy.c

+ 4 - 4
Makefile.am

@@ -99,14 +99,14 @@ all-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 clean-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
-	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top
+	$(RM) starpu-top/starpu_top.1 starpu-top/starpu_top$(EXEEXT)
 # TODO: resources
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)
-	$(INSTALL_STRIP_PROGRAM) starpu-top/starpu_top $(DESTDIR)$(bindir)
+	-$(INSTALL_STRIP_PROGRAM) starpu-top/starpu_top$(EXEEXT) $(DESTDIR)$(bindir)
 uninstall-local:
-	$(RM) $(DESTDIR)$(bindir)/starpu_top
-	$(RM) starpu-top/starpu_top
+	$(RM) $(DESTDIR)$(bindir)/starpu_top$(EXEEXT)
+	$(RM) starpu-top/starpu_top$(EXEEXT)
 	$(RM) starpu-top/Makefile
 
 if STARPU_HAVE_HELP2MAN

+ 19 - 17
doc/doxygen/Makefile.am

@@ -93,44 +93,46 @@ chapters =	\
 	chapters/api/task_lists.doxy \
 	chapters/api/top.doxy \
 	chapters/api/versioning.doxy \
-	chapters/api/workers.doxy
+	chapters/api/workers.doxy \
+	chapters/api/threads.doxy \
+	chapters/api/toolbox.doxy
 
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	sed 's/#undef \(.*\)/#define \1 1/' $< > $@
 
 chapters/version.sty: $(chapters)
-	@for f in $(chapters) ; do \
-                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f 2>/dev/null ; fi \
+	for f in $(chapters) ; do \
+                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
         done | sort -r | head -1 > timestamp
-	@if test -s timestamp ; then \
-		LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated 2>/dev/null;\
-		LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month 2>/dev/null;\
+	if test -s timestamp ; then \
+		LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated ;\
+		LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month ;\
 	fi
-	@if test -s timestamp_updated ; then \
+	if test -s timestamp_updated ; then \
 		echo "\newcommand{\STARPUUPDATED}{"`cat timestamp_updated`"}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	else \
 		echo "\newcommand{\STARPUUPDATED}{unknown date}" > $(top_srcdir)/doc/doxygen/chapters/version.sty;\
 	fi
-	@echo "\newcommand{\STARPUVERSION}{$(VERSION)}" >> $(top_srcdir)/doc/doxygen/chapters/version.sty
-	@for f in timestamp timestamp_updated timestamp_updated_month ; do \
+	echo "\newcommand{\STARPUVERSION}{$(VERSION)}" >> $(top_srcdir)/doc/doxygen/chapters/version.sty
+	for f in timestamp timestamp_updated timestamp_updated_month ; do \
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 
 chapters/version.html: $(chapters)
-	@for f in $(chapters) ; do \
-                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f 2>/dev/null ; fi \
+	for f in $(chapters) ; do \
+                if test -f $(top_srcdir)/doc/doxygen/$$f ; then stat --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
         done | sort -r | head -1 > timestamp
-	@if test -s timestamp ; then \
-		LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated 2>/dev/null;\
-		LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month 2>/dev/null;\
+	if test -s timestamp ; then \
+		LC_ALL=C date --date=@`cat timestamp` +"%d %B %Y" > timestamp_updated ;\
+		LC_ALL=C date --date=@`cat timestamp` +"%B %Y" > timestamp_updated_month ;\
 	fi
-	@echo "This manual documents the usage of StarPU version $(VERSION)." > $(top_srcdir)/doc/doxygen/chapters/version.html
-	@if test -s timestamp_updated ; then \
+	echo "This manual documents the usage of StarPU version $(VERSION)." > $(top_srcdir)/doc/doxygen/chapters/version.html
+	if test -s timestamp_updated ; then \
 		echo "Its contents was last updated on "`cat timestamp_updated`"." >> $(top_srcdir)/doc/doxygen/chapters/version.html;\
 	else \
 		echo "Its contents was last updated on <em>unknown_date</em>." >> $(top_srcdir)/doc/doxygen/chapters/version.html;\
 	fi
-	@for f in timestamp timestamp_updated timestamp_updated_month ; do \
+	for f in timestamp timestamp_updated timestamp_updated_month ; do \
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 

+ 2 - 2
doc/doxygen/chapters/advanced_examples.doxy

@@ -399,11 +399,11 @@ the number of iterations in the base.
 StarPU will automatically determine when the performance model is calibrated,
 or rather, it will assume the performance model is calibrated until the
 application submits a task for which the performance can not be predicted. For
-::STARPU_HISTORY_BASED, StarPU will require 10 (::_STARPU_CALIBRATION_MINIMUM)
+::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
 measurements for a given size before estimating that an average can be taken as
 estimation for further executions with the same size. For
 ::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
-10 (::_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
 data size is smaller than 90% of the maximum measured data size (i.e. the
 measurement interval is large enough for a regression to have a meaning).
 Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment

+ 20 - 7
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -56,7 +56,6 @@ The task is waiting for a task.
 \ingroup API_Codelet_And_Tasks
 The task is waiting for some data.
 
-
 \def STARPU_CPU
 \ingroup API_Codelet_And_Tasks
 This macro is used when setting the field starpu_codelet::where
@@ -110,26 +109,40 @@ this macro indicates the codelet will have several implementations.
 The use of this macro is deprecated. One should always only define the
 field starpu_codelet::opencl_funcs.
 
-\def starpu_cpu_func_t
+\def STARPU_NMAXBUFS
+\ingroup API_Codelet_And_Tasks
+Defines the maximum number of buffers that tasks will be able to take
+as parameters. The default value is 8, it can be changed by using the
+configure option \ref enable-maxbuffers "--enable-maxbuffers".
+
+\typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.
 
-\def starpu_cuda_func_t
+\typedef starpu_cuda_func_t
 \ingroup API_Codelet_And_Tasks
 CUDA implementation of a codelet.
 
-\def starpu_opencl_func_t
+\typedef starpu_opencl_func_t
 \ingroup API_Codelet_And_Tasks
 OpenCL implementation of a codelet.
 
-\def starpu_mic_func_t
+\typedef starpu_mic_func_t
 \ingroup API_Codelet_And_Tasks
 MIC implementation of a codelet.
 
-\def starpu_scc_func_t
+\typedef starpu_scc_func_t
 \ingroup API_Codelet_And_Tasks
 SCC implementation of a codelet.
 
+\typedef starpu_mic_kernel_t
+\ingroup API_Codelet_And_Tasks
+MIC kernel for a codelet
+
+\typedef starpu_scc_kernel_t
+\ingroup API_Codelet_And_Tasks
+SCC kernel for a codelet
+
 \struct starpu_codelet
 The codelet structure describes a kernel that is possibly
 implemented on various targets. For compatibility, make sure to
@@ -137,7 +150,7 @@ initialize the whole structure to zero, either by using explicit
 memset, or the function starpu_codelet_init(), or by letting the
 compiler implicitly do it in e.g. static storage case.
 \ingroup API_Codelet_And_Tasks
-\var starpu_codelet::where.
+\var starpu_codelet::where
 Optional field to indicate which types of processing units are able to
 execute the codelet. The different values ::STARPU_CPU, ::STARPU_CUDA,
 ::STARPU_OPENCL can be combined to specify on which types of processing

+ 6 - 0
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -14,6 +14,12 @@ This macro is defined when StarPU has been installed with CUDA
 support. It should be used in your code to detect the availability of
 CUDA as shown in \ref FullSourceCodeVectorScal.
 
+\def STARPU_MAXCUDADEVS
+\ingroup API_CUDA_Extensions
+This macro defines the maximum number of CUDA devices that are
+supported by StarPU.
+
+
 \fn cudaStream_t starpu_cuda_get_local_stream(void)
 \ingroup API_CUDA_Extensions
 This function gets the current worker’s CUDA stream. StarPU

+ 27 - 1
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -186,6 +186,32 @@ to manage asynchronicity. This must return -EAGAIN if any of the
 starpu_interface_copy() calls has returned -EAGAIN (i.e. at least some
 transfer is still ongoing), and return 0 otherwise.
 
+\enum starpu_data_interface_id
+\ingroup API_Data_Interfaces
+Identifier for all predefined StarPU data interfaces
+\var starpu_data_interface_id::STARPU_UNKNOWN_INTERFACE_ID
+Unknown interface
+\var starpu_data_interface_id::STARPU_MATRIX_INTERFACE_ID
+Identifier for the matrix data interface
+\var starpu_data_interface_id::STARPU_BLOCK_INTERFACE_ID
+Identifier for block data interface
+\var starpu_data_interface_id::STARPU_VECTOR_INTERFACE_ID
+Identifier for the vector data interface
+\var starpu_data_interface_id::STARPU_CSR_INTERFACE_ID
+Identifier for the csr data interface
+\var starpu_data_interface_id::STARPU_BCSR_INTERFACE_ID
+Identifier for the bcsr data interface
+\var starpu_data_interface_id::STARPU_VARIABLE_INTERFACE_ID
+Identifier for the variable data interface
+\var starpu_data_interface_id::STARPU_VOID_INTERFACE_ID
+Identifier for the void data interface
+\var starpu_data_interface_id::STARPU_MULTIFORMAT_INTERFACE_ID
+Identifier for the multiformat data interface
+\var starpu_data_interface_id::STARPU_COO_INTERFACE_ID
+Identifier for the coo data interface
+\var starpu_data_interface_id::STARPU_MAX_INTERFACE_ID
+Maximum number of data interfaces
+
 @name Registering Data
 \ingroup API_Data_Interfaces
 
@@ -726,7 +752,7 @@ addition to this.
 Return a pointer to the row pointer array of the matrix
 designated by \p interface.
 
-\def STARPU_CSR_GET_ROWPTR_DEV_HANDLE(interface)
+\def STARPU_BCSR_GET_ROWPTR_DEV_HANDLE(interface)
 \ingroup API_Data_Interfaces
 Return a device handle for the row pointer array of the matrix
 designated by \p interface. The offset documented below has to be used in

+ 1 - 1
doc/doxygen/chapters/api/data_management.doxy

@@ -251,7 +251,7 @@ This is the same as starpu_data_acquire_cb(), except that the
 data will be available on the given memory node instead of main
 memory.
 
-\int int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
+\fn int starpu_data_acquire_on_node_cb_sequential_consistency(starpu_data_handle_t handle, unsigned node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
 \ingroup API_Data_Management
 This is the same as starpu_data_acquire_cb_sequential_consistency(), except that the
 data will be available on the given memory node instead of main

+ 14 - 4
doc/doxygen/chapters/api/data_out_of_core.doxy

@@ -7,13 +7,23 @@
 
 /*! \defgroup API_Out_Of_Core Out Of Core
 
-
-
 \struct starpu_disk_ops
 \ingroup API_Out_Of_Core
 This is a set of functions to manipulate datas on disk.
-
-\fn int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, size_t size) 
+\var starpu_disk_ops::alloc
+\var starpu_disk_ops::free
+\var starpu_disk_ops::open
+open an existing file
+\var starpu_disk_ops::close
+\var starpu_disk_ops::read
+~= pread
+\var starpu_disk_ops::write
+\var starpu_disk_ops::plug
+\var starpu_disk_ops::unplug
+\var starpu_disk_ops::copy
+\var starpu_disk_ops::bandwidth
+
+\fn int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, size_t size)
 \ingroup API_Out_Of_Core
 Register a disk memory node with a set of functions to manipulate datas. <br />
 SUCCESS: return the disk node. <br />

+ 5 - 0
doc/doxygen/chapters/api/mic_extensions.doxy

@@ -13,6 +13,11 @@
 This macro is defined when StarPU has been installed with MIC support.
 It should be used in your code to detect the availability of MIC.
 
+\def STARPU_MAXMICDEVS
+\ingroup API_MIC_Extensions
+This macro defines the maximum number of MIC devices that are
+supported by StarPU.
+
 \typedef starpu_mic_func_symbol_t
 \ingroup API_MIC_Extensions
 Type for MIC function symbols

+ 6 - 0
doc/doxygen/chapters/api/mpi.doxy

@@ -11,6 +11,12 @@
 @name Initialisation
 \ingroup API_MPI_Support
 
+\def STARPU_USE_MPI
+\ingroup API_MPI_Support
+This macro is defined when StarPU has been installed with MPI
+support. It should be used in your code to detect the availability of
+MPI.
+
 \fn int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
 \ingroup API_MPI_Support
 Initializes the starpumpi library. \p initialize_mpi indicates if MPI

+ 4 - 0
doc/doxygen/chapters/api/multiformat_data_interface.doxy

@@ -64,6 +64,10 @@ returns the local pointer to the data with CUDA format.
 \ingroup API_Multiformat_Data_Interface
 returns the local pointer to the data with OpenCL format.
 
+\def STARPU_MULTIFORMAT_GET_MIC_PTR(interface)
+\ingroup API_Multiformat_Data_Interface
+returns the local pointer to the data with MIC format.
+
 \def STARPU_MULTIFORMAT_GET_NX(interface)
 \ingroup API_Multiformat_Data_Interface
 returns the number of elements in the data.

+ 10 - 0
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -14,6 +14,16 @@ This macro is defined when StarPU has been installed with
 OpenCL support. It should be used in your code to detect the
 availability of OpenCL as shown in \ref FullSourceCodeVectorScal.
 
+\def STARPU_MAXOPENCLDEVS
+\ingroup API_OpenCL_Extensions
+This macro defines the maximum number of OpenCL devices that are
+supported by StarPU.
+
+\def STARPU_OPENCL_DATADIR
+\ingroup API_OpenCL_Extensions
+This macro defines the directory in which the OpenCL codelets of the
+applications provided with StarPU have been installed.
+
 \struct starpu_opencl_program
 \ingroup API_OpenCL_Extensions
 Stores the OpenCL programs as compiled for the different OpenCL

+ 6 - 10
doc/doxygen/chapters/api/performance_model.doxy

@@ -260,20 +260,16 @@ existing set of measurements done in good conditions, that StarPU
 could benefit from instead of doing on-line measurements. And example
 of use can be seen in \ref PerformanceModelExample.
 
-\fn double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
+\fn double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
+Return the bandwidth of data transfer between two memory nodes
 
-\fn double starpu_get_latency_RAM_CUDA(unsigned cudadev)
+\fn double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
+Return the latency of data transfer between two memory nodes
 
-\fn double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
-\ingroup API_Performance_Mode
-Used to compute the execution time of tasks
-
-\fn double starpu_get_latency_CUDA_RAM(unsigned cudadev)
+\fn double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
+Return the estimated time to transfer a given size between two memory nodes.
 
 */

+ 15 - 0
doc/doxygen/chapters/api/profiling.doxy

@@ -98,6 +98,16 @@ todo
 \var starpu_profiling_bus_info::transfer_count
         Number of transfers during profiling.
 
+\typedef STARPU_PROFILING_DISABLE
+\ingroup API_Profiling
+This value is used when calling the function
+starpu_profiling_status_set() to disable profiling.
+
+\typedef STARPU_PROFILING_ENABLE
+\ingroup API_Profiling
+This value is used when calling the function
+starpu_profiling_status_set() to enable profiling.
+
 \fn int starpu_profiling_status_set(int status)
 \ingroup API_Profiling
 This function sets the profiling status. Profiling is activated
@@ -114,6 +124,11 @@ previous status is returned.
 Return the current profiling status or a negative value in case
 there was an error.
 
+\fn int starpu_profiling_init(void)
+\ingroup API_Profiling
+This function resets performance counters and enable profiling if the
+environment variable \ref STARPU_PROFILING is set to a positive value.
+
 \fn void starpu_profiling_set_id(int new_id)
 \ingroup API_Profiling
 This function sets the ID used for profiling trace filename. It

+ 5 - 0
doc/doxygen/chapters/api/scc_extensions.doxy

@@ -13,6 +13,11 @@
 This macro is defined when StarPU has been installed with SCC support.
 It should be used in your code to detect the availability of SCC.
 
+\def STARPU_MAXSCCDEVS
+\ingroup API_SCC_Extensions
+This macro defines the maximum number of SCC devices that are
+supported by StarPU.
+
 \typedef starpu_scc_func_symbol_t
 \ingroup API_SCC_Extensions
 Type for SCC function symbols

+ 10 - 0
doc/doxygen/chapters/api/standard_memory_library.doxy

@@ -8,6 +8,16 @@
 
 /*! \defgroup API_Standard_Memory_Library Standard Memory Library
 
+\def starpu_data_malloc_pinned_if_possible
+\ingroup API_Standard_Memory_Library
+\deprecated
+Equivalent to starpu_malloc(). This macro is provided to avoid breaking old codes.
+
+\def starpu_data_free_pinned_if_possible
+\ingroup API_Standard_Memory_Library
+\deprecated
+Equivalent to starpu_free(). This macro is provided to avoid breaking old codes.
+
 \def STARPU_MALLOC_PINNED
 \ingroup API_Standard_Memory_Library
 Value passed to the function starpu_malloc_flags() to indicate the memory allocation should be pinned.

+ 306 - 0
doc/doxygen/chapters/api/threads.doxy

@@ -0,0 +1,306 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_Threads Threads
+
+\brief This section describes the thread facilities provided
+by StarPU. The thread function are either implemented on top of the
+pthread library or the Simgrid library when the simulated performance
+mode is enabled (\ref SimulatedPerformance).
+
+\def STARPU_PTHREAD_CREATE_ON
+\ingroup API_Threads
+This macro calls the function starpu_pthread_create_on() and aborts on error.
+
+\def STARPU_PTHREAD_CREATE
+\ingroup API_Threads
+This macro calls the function starpu_pthread_create() and aborts on error.
+
+\def STARPU_PTHREAD_MUTEX_INIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_mutex_init() and aborts
+on error.
+
+\def STARPU_PTHREAD_MUTEX_DESTROY
+\ingroup API_Threads
+This macro calls the function starpu_pthread_mutex_destroy() and
+aborts on error.
+
+\def STARPU_PTHREAD_MUTEX_LOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_mutex_lock() and aborts
+on error.
+
+\def STARPU_PTHREAD_MUTEX_UNLOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_mutex_unlock() and aborts
+on error.
+
+\def STARPU_PTHREAD_KEY_CREATE
+\ingroup API_Threads
+This macro calls the function starpu_pthread_key_create() and aborts
+on error.
+
+\def STARPU_PTHREAD_KEY_DELETE
+\ingroup API_Threads
+This macro calls the function starpu_pthread_key_delete() and aborts
+on error.
+
+\def STARPU_PTHREAD_SETSPECIFIC
+\ingroup API_Threads
+This macro calls the function starpu_pthread_setspecific() and aborts
+on error.
+
+\def STARPU_PTHREAD_GETSPECIFIC
+\ingroup API_Threads
+This macro calls the function starpu_pthread_getspecific() and aborts
+on error.
+
+\def STARPU_PTHREAD_RWLOCK_INIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_init() and aborts
+on error.
+
+\def STARPU_PTHREAD_RWLOCK_RDLOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_rdlock() and
+aborts on error.
+
+\def STARPU_PTHREAD_RWLOCK_WRLOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_wrlock() and
+aborts on error.
+
+\def STARPU_PTHREAD_RWLOCK_UNLOCK
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_unlock() and
+aborts on error.
+
+\def STARPU_PTHREAD_RWLOCK_DESTROY
+\ingroup API_Threads
+This macro calls the function starpu_pthread_rwlock_destroy() and
+aborts on error.
+
+\def STARPU_PTHREAD_COND_INIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_init() and aborts on error.
+
+\def STARPU_PTHREAD_COND_DESTROY
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_destroy() and aborts
+on error.
+
+\def STARPU_PTHREAD_COND_SIGNAL
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_signal() and aborts
+on error.
+
+\def STARPU_PTHREAD_COND_BROADCAST
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_broadcast() and
+aborts on error.
+
+\def STARPU_PTHREAD_COND_WAIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_cond_wait() and aborts on error.
+
+\def STARPU_PTHREAD_BARRIER_INIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_barrier_init() and aborts
+on error.
+
+\def STARPU_PTHREAD_BARRIER_DESTROY
+\ingroup API_Threads
+This macro calls the function starpu_pthread_barrier_destroy() and
+aborts on error.
+
+\def STARPU_PTHREAD_BARRIER_WAIT
+\ingroup API_Threads
+This macro calls the function starpu_pthread_barrier_wait() and aborts
+on error.
+
+\fn int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, int where)
+\ingroup API_Threads
+
+\fn int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg)
+\ingroup API_Threads
+This function starts a new thread in the calling process.  The new
+thread starts execution by invoking \p start_routine; \p arg is passed
+as the sole argument of \p start_routine.
+
+\fn int starpu_pthread_join(starpu_pthread_t thread, void **retval)
+\ingroup API_Threads
+This function waits for the thread specified by \p thread to
+terminate.  If that thread has already terminated, then the function
+returns immediately. The thread specified by \p thread must be
+joinable.
+
+\fn int starpu_pthread_attr_init(starpu_pthread_attr_t *attr)
+\ingroup API_Threads
+This function initializes the thread attributes object pointed to by
+\p attr with default attribute values.
+
+It does not do anything when the simulated performance mode is enabled
+(\ref SimulatedPerformance).
+
+\fn int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr)
+\ingroup API_Threads
+This function destroys a thread attributes object which is no longer
+required. Destroying a thread attributes object has no effect on
+threads that were created using that object.
+
+It does not do anything when the simulated performance mode is enabled
+(\ref SimulatedPerformance).
+
+\fn int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate)
+\ingroup API_Threads
+This function sets the detach state attribute of the thread attributes
+object referred to by \p attr to the value specified in \p
+detachstate.  The detach state attribute determines whether a thread
+created using the thread attributes object \p attr will be created in
+a joinable or a detached state.
+
+It does not do anything when the simulated performance mode is enabled
+(\ref SimulatedPerformance).
+
+\fn int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr)
+\ingroup API_Threads
+This function initializes the mutex object pointed to by \p mutex
+according to the mutex attributes specified in \p mutexattr.  If \p
+mutexattr is NULL, default attributes are used instead.
+
+\fn int starpu_pthread_mutex_destroy(starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function destroys a mutex object, freeing the resources it might
+hold. The mutex must be unlocked on entrance.
+
+\fn int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function locks the given mutex. If the mutex is currently
+unlocked, it becomes locked and owned by the calling thread, and the
+function returns immediately. If the mutex is already locked by
+another thread, the function suspends the calling thread until the
+mutex is unlocked.
+
+This function also produces trace when the configure option \ref
+enable-fxt-lock "--enable-fxt-lock" is enabled.
+
+\fn int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function unlocks the given mutex. The mutex is assumed to be
+locked and owned by the calling thread on entrance to
+starpu_pthread_mutex_unlock().
+
+This function also produces trace when the configure option \ref
+enable-fxt-lock "--enable-fxt-lock" is enabled.
+
+\fn int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function behaves identically to starpu_pthread_mutex_lock(),
+except that it does not block the calling thread if the mutex is
+already locked by another thread (or by the calling thread in the case
+of a ``fast''  mutex). Instead, the function returns immediately with
+the error code EBUSY.
+
+This function also produces trace when the configure option \ref
+enable-fxt-lock "--enable-fxt-lock" is enabled.
+
+\typedef STARPU_PTHREAD_MUTEX_INITIALIZER
+\ingroup API_Threads
+This macro initializes the mutex given in parameter.
+
+\fn int starpu_pthread_key_create(starpu_pthread_key_t *key, void (*destr_function) (void *))
+\ingroup API_Threads
+This function allocates a new TSD key. The key is stored in the
+location pointed to by \p key.
+
+\fn int starpu_pthread_key_delete(starpu_pthread_key_t key)
+\ingroup API_Threads
+This function deallocates a TSD key. It does not check whether
+non-NULL values are associated with that key in the currently
+executing threads, nor call the destructor function associated with
+the key.
+
+\fn int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
+\ingroup API_Threads
+This function changes the value associated with \p key in the calling
+thread, storing the given \p pointer instead.
+
+\fn void *starpu_pthread_getspecific(starpu_pthread_key_t key)
+\ingroup API_Threads
+This function returns the value associated with \p key on success, and
+NULL on error.
+
+\typedef STARPU_PTHREAD_COND_INITIALIZER
+\ingroup API_Threads
+This macro initializes the condition variable given in parameter.
+
+\fn starpu_pthread_cond_init(starpu_pthread_cond_t *cond, starpu_pthread_condattr_t *cond_attr)
+\ingroup API_Threads
+This function initializes the condition variable \p cond, using the
+condition attributes specified in \p cond_attr, or default attributes
+if \p cond_attr is NULL.
+
+\fn starpu_pthread_cond_signal(starpu_pthread_cond_t *cond)
+\ingroup API_Threads
+This function restarts one of the threads that are waiting on the
+condition variable \p cond. If no threads are waiting on \p cond,
+nothing happens. If several threads are waiting on \p cond, exactly
+one is restarted, but it not specified which.
+
+\fn starpu_pthread_cond_broadcast(starpu_pthread_cond_t *cond)
+\ingroup API_Threads
+This function restarts all the threads that are waiting on the
+condition variable \p cond. Nothing happens if no threads are waiting on cond.
+
+\fn starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
+\ingroup API_Threads
+This function atomically unlocks the mutex (as per
+starpu_pthread_mutex_unlock()) and waits for the condition variable \p cond
+to be signaled. The thread execution is suspended and does not consume
+any CPU time until the condition variable is signaled. The mutex must
+be locked by the calling thread on entrance to
+starpu_pthread_cond_wait(). Before returning to the calling thread, the
+function re-acquires mutex (as per starpu_pthread_mutex_lock()).
+
+This function also produces trace when the configure option \ref
+enable-fxt-lock "--enable-fxt-lock" is enabled.
+
+\fn starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex, const struct timespec *abstime)
+\ingroup API_Threads
+This function atomically unlocks \p mutex and waits on \p cond, as
+starpu_pthread_cond_wait() does, but it also bounds the duration of
+the wait.
+
+\fn starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond)
+\ingroup API_Threads
+This function destroys a condition variable, freeing the resources it
+might hold. No threads must be waiting on the condition variable on
+entrance to the function.
+
+\fn starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_init().
+
+\fn starpu_pthread_rwlock_destroy(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_destroy().
+
+\fn starpu_pthread_rwlock_rdlock(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_lock().
+
+\fn starpu_pthread_rwlock_wrlock(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_lock().
+
+\fn starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
+\ingroup API_Threads
+This function is the same as starpu_pthread_mutex_unlock().
+
+*/

+ 111 - 0
doc/doxygen/chapters/api/toolbox.doxy

@@ -0,0 +1,111 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_Toolbox Toolbox
+
+\brief The following macros allow to make GCC extensions portable, and
+to have a code which can be compiled with any C compiler.
+
+\def STARPU_GNUC_PREREQ
+\ingroup API_Toolbox
+Return true (non-zero) if GCC version MAJ.MIN or later is being used (macro taken from glibc.)
+
+\def STARPU_UNLIKELY
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro allows programmers to mark an expression as unlikely.
+
+\def STARPU_LIKELY
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro allows programmers to mark an expression as likely.
+
+\def STARPU_ATTRIBUTE_UNUSED
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((unused))
+
+\def STARPU_ATTRIBUTE_INTERNAL
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((visibility ("internal")))
+
+\def STARPU_ATTRIBUTE_MALLOC
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((malloc))
+
+\def STARPU_ATTRIBUTE_WARN_UNUSED_RESULT
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((warn_unused_result))
+
+\def STARPU_ATTRIBUTE_PURE
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((pure))
+
+\def STARPU_ATTRIBUTE_ALIGNED
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((aligned(size)))
+
+\def STARPU_WARN_UNUSED_RESULT
+\ingroup API_Toolbox
+When building with a GNU C Compiler, this macro is defined to __attribute__((__warn_unused_result__))
+
+\def STARPU_POISON_PTR
+\ingroup API_Toolbox
+This macro defines a value which can be used to mark pointers as
+invalid values.
+
+\def STARPU_MIN
+\ingroup API_Toolbox
+This macro returns the min of the two parameters.
+
+\def STARPU_MAX
+\ingroup API_Toolbox
+This macro returns the max of the two parameters.
+
+\def STARPU_ASSERT
+\ingroup API_Toolbox
+Unless StarPU has been configured with the option \ref enable-fast
+"--enable-fast", this macro will abort if the expression is false.
+
+\def STARPU_ASSERT_MSG
+\ingroup API_Toolbox
+Unless StarPU has been configured with the option \ref enable-fast
+"--enable-fast", this macro will abort if the expression is false. The
+given message will be displayed.
+
+\def STARPU_ABORT
+\ingroup API_Toolbox
+This macro aborts the program.
+
+\def STARPU_ABORT_MSG
+\ingroup API_Toolbox
+This macro aborts the program, and displays the given message.
+
+\def STARPU_CHECK_RETURN_VALUE
+\ingroup API_Toolbox
+If \p err has a value which is not 0, the given message is displayed
+before aborting.
+
+\def STARPU_CHECK_RETURN_VALUE_IS
+\ingroup API_Toolbox
+If \p err has a value which is not \p value, the given message is displayed
+before aborting.
+
+\def STARPU_RMB
+\ingroup API_Toolbox
+This macro can be used to do a synchronization.
+
+\def STARPU_WMB
+\ingroup API_Toolbox
+This macro can be used to do a synchronization.
+
+\fn int starpu_get_env_number(const char *str)
+\ingroup API_Toolbox
+If \p str is the name of an existing environment variable which is
+defined to an integer, the function returns the value of the integer.
+It returns 0 otherwise.
+
+*/
+

+ 4 - 0
doc/doxygen/chapters/api/workers.doxy

@@ -8,6 +8,10 @@
 
 /*! \defgroup API_Workers_Properties Workers’ Properties
 
+\def STARPU_NMAXWORKERS
+\ingroup API_Workers_Properties
+Define the maximum number of workers managed by StarPU.
+
 \enum starpu_node_kind
 \ingroup API_Workers_Properties
 TODO

+ 36 - 0
doc/doxygen/chapters/environment_variables.doxy

@@ -70,6 +70,20 @@ STARPU_OPENCL_ONLY_ON_CPUS to 1, the OpenCL driver will ONLY enable
 CPU devices.
 </dd>
 
+<dt>STARPU_NMIC</dt>
+<dd>
+\anchor STARPU_NMIC
+\addindex __env__STARPU_NMIC
+MIC equivalent of the environment variable \ref STARPU_NCUDA.
+</dd>
+
+<dt>STARPU_NSCC</dt>
+<dd>
+\anchor STARPU_NSCC
+\addindex __env__STARPU_NSCC
+SCC equivalent of the environment variable \ref STARPU_NCUDA.
+</dd>
+
 <dt>STARPU_WORKERS_NOBIND</dt>
 <dd>
 \anchor STARPU_WORKERS_NOBIND
@@ -136,6 +150,28 @@ starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
 is set.
 </dd>
 
+<dt>STARPU_WORKERS_MICID</dt>
+<dd>
+\anchor STARPU_WORKERS_MICID
+\addindex __env__STARPU_WORKERS_MICID
+MIC equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_mic_deviceid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_WORKERS_SCCID</dt>
+<dd>
+\anchor STARPU_WORKERS_SCCID
+\addindex __env__STARPU_WORKERS_SCCID
+SCC equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_scc_deviceid passed to starpu_init()
+is set.
+</dd>
+
 <dt>STARPU_SINGLE_COMBINED_WORKER</dt>
 <dd>
 \anchor STARPU_SINGLE_COMBINED_WORKER

+ 4 - 1
doc/doxygen/chapters/performance_feedback.doxy

@@ -35,7 +35,10 @@ call starpu_profiling_status_set() with the parameter
 is already enabled or not by calling starpu_profiling_status_get().
 Enabling monitoring also reinitialize all previously collected
 feedback. The environment variable \ref STARPU_PROFILING can also be
-set to <c>1</c> to achieve the same effect.
+set to <c>1</c> to achieve the same effect. The function
+starpu_profiling_init() can also be called during the execution to
+reinitialize performance counters and to start the profiling if the
+environment variable \ref STARPU_PROFILING is set to <c>1</c>.
 
 Likewise, performance monitoring is stopped by calling
 starpu_profiling_status_set() with the parameter

+ 2 - 1
doc/doxygen/doxygen.cfg

@@ -639,7 +639,7 @@ CITE_BIB_FILES         =
 # The QUIET tag can be used to turn on/off the messages that are generated
 # by doxygen. Possible values are YES and NO. If left blank NO is used.
 
-QUIET                  = NO
+QUIET                  = YES
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
 # generated by doxygen. Possible values are YES and NO. If left blank
@@ -1622,6 +1622,7 @@ PREDEFINED             = STARPU_USE_OPENCL=1 \
 			 STARPU_USE_MPI=1 \
 			 STARPU_HAVE_HWLOC=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \
+			 STARPU_SIMGRID=1 \
                          __GCC__
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then

+ 2 - 0
doc/doxygen/refman.tex

@@ -193,6 +193,8 @@ Documentation License”.
 \input{group__API__Versioning}
 \input{group__API__Initialization__and__Termination}
 \input{group__API__Standard__Memory__Library}
+\input{group__API__Toolbox}
+\input{group__API__Threads}
 \input{group__API__Workers__Properties}
 \input{group__API__Data__Management}
 \input{group__API__Data__Interfaces}

+ 4 - 2
examples/stencil/stencil-kernels.c

@@ -18,6 +18,8 @@
 #include "stencil.h"
 #include <sys/time.h>
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
 #ifndef timersub
 #define	timersub(x, y, res) \
 	do \
@@ -382,9 +384,9 @@ void update_func_cpu(void *descr[], void *arg)
 	int workerid = starpu_worker_get_id();
 	DEBUG( "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
 	if (block->bz == 0)
-fprintf(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
+		FPRINTF(stderr,"!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 	else
-	DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
+		DEBUG( "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
 #ifdef STARPU_USE_MPI
 	int rank = 0;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

+ 0 - 3
include/starpu.h

@@ -141,13 +141,10 @@ int starpu_asynchronous_copy_disabled(void);
 int starpu_asynchronous_cuda_copy_disabled(void);
 int starpu_asynchronous_opencl_copy_disabled(void);
 
-void starpu_profiling_init();
 void starpu_display_stats();
 
 void starpu_get_version(int *major, int *minor, int *release);
 
-int starpu_worker_get_mp_nodeid(int id);
-
 #ifdef __cplusplus
 }
 #endif

+ 0 - 2
include/starpu_data.h

@@ -81,8 +81,6 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 
 void starpu_data_display_memory_stats();
 
-/* XXX These macros are provided to avoid breaking old codes. But consider
- * these function names as deprecated. */
 #define starpu_data_malloc_pinned_if_possible	starpu_malloc
 #define starpu_data_free_pinned_if_possible	starpu_free
 

+ 1 - 1
include/starpu_data_interfaces.h

@@ -96,7 +96,7 @@ enum starpu_data_interface_id
 	STARPU_VOID_INTERFACE_ID=6,
 	STARPU_MULTIFORMAT_INTERFACE_ID=7,
 	STARPU_COO_INTERFACE_ID=8,
-	STARPU_MAX_INTERFACE_ID=9 /* maximum number of data interfaces */
+	STARPU_MAX_INTERFACE_ID=9
 };
 
 struct starpu_data_interface_ops

+ 0 - 3
include/starpu_disk.h

@@ -39,18 +39,15 @@ struct starpu_disk_ops {
 	 int 	(*full_write)   (unsigned node, void * base, void * obj, void * ptr, size_t size);
 };
 
-
 /* Posix functions to use disk memory */
 extern struct starpu_disk_ops starpu_disk_stdio_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_ops;
 extern struct starpu_disk_ops starpu_disk_unistd_o_direct_ops;
 
-/*functions to add an existing memory */
 void starpu_disk_close(unsigned node, void *obj, size_t size);
 
 void * starpu_disk_open(unsigned node, void *pos, size_t size);
 
-/* interface to create and to free a memory disk */
 int starpu_disk_register(struct starpu_disk_ops * func, void *parameter, size_t size);
 
 #endif /* __STARPU_DISK_H__ */

+ 3 - 5
include/starpu_perfmodel.h

@@ -169,11 +169,9 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 
-double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
-double starpu_get_latency_RAM_CUDA(unsigned cudadev);
-double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev);
-double starpu_get_latency_CUDA_RAM(unsigned cudadev);
-
+double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
+double starpu_transfer_latency(unsigned src_node, unsigned dst_node);
+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size);
 
 #ifdef __cplusplus
 }

+ 1 - 0
include/starpu_profiling.h

@@ -80,6 +80,7 @@ struct starpu_profiling_bus_info
 	int transfer_count;
 };
 
+void starpu_profiling_init();
 void starpu_profiling_set_id(int new_id);
 int starpu_profiling_status_set(int status);
 int starpu_profiling_status_get(void);

+ 1 - 0
include/starpu_scheduler.h

@@ -76,6 +76,7 @@ double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum star
 double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node);
 double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl);
 
+void starpu_sched_ctx_worker_shares_tasks_lists(int workerid, int sched_ctx_id);
 #ifdef __cplusplus
 }
 #endif

+ 2 - 2
include/starpu_thread_util.h

@@ -250,8 +250,8 @@
 	}                                                                      \
 } while (0)
 
-#define STARPU_PTHREAD_BARRIER_WAIT(barrier) do {                             \
-	int p_ret = pthread_barrier_wait(barrier);                             \
+#define STARPU_PTHREAD_BARRIER_WAIT(barrier) do {                             	\
+	int p_ret = pthread_barrier_wait((barrier));				\
 	if (STARPU_UNLIKELY(!((p_ret == 0) || (p_ret == PTHREAD_BARRIER_SERIAL_THREAD)))) { \
 		fprintf(stderr,                                                \
 			"%s:%d pthread_barrier_wait: %s\n",                    \

+ 0 - 2
include/starpu_util.h

@@ -30,8 +30,6 @@ extern "C"
 {
 #endif
 
-/* Return true (non-zero) if GCC version MAJ.MIN or later is being used
- * (macro taken from glibc.)  */
 #if defined __GNUC__ && defined __GNUC_MINOR__
 # define STARPU_GNUC_PREREQ(maj, min) \
 	((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))

+ 59 - 66
mpi/src/starpu_mpi.c

@@ -770,43 +770,44 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 			  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr,
 			  _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype, req->internal_req);
 
-	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
+	if (req->internal_req)
 	{
-		if (req->user_datatype == 1)
-		{
-			if (req->request_type == SEND_REQ)
-			{
-				// We need to make sure the communication for sending the size
-				// has completed, as MPI can re-order messages, let's call
-				// MPI_Wait to make sure data have been sent
-				ret = MPI_Wait(&req->size_req, MPI_STATUS_IGNORE);
-				STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %d", ret);
-
-			}
-			if (req->request_type == RECV_REQ)
-				// req->ptr is freed by starpu_data_unpack
-				starpu_data_unpack(req->data_handle, req->ptr, req->count);
-			else
-				free(req->ptr);
-		}
-		else
+		struct _starpu_mpi_copy_handle *chandle = find_chandle(starpu_data_get_tag(req->data_handle));
+		_STARPU_MPI_DEBUG(3, "Handling deleting of copy_handle structure from the hashmap..\n");
+		delete_chandle(chandle);
+		free(chandle);
+	}
+	else
+	{
+		if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
 		{
-			struct _starpu_mpi_copy_handle *chandle = find_chandle(starpu_data_get_tag(req->data_handle));
-			if (chandle && (req->data_handle != chandle->handle))
+			if (req->user_datatype == 1)
 			{
-				_STARPU_MPI_DEBUG(3, "Handling deleting of copy_handle structure from the hashmap..\n");
-				delete_chandle(chandle);
-				free(chandle);
+				if (req->request_type == SEND_REQ)
+				{
+					// We need to make sure the communication for sending the size
+					// has completed, as MPI can re-order messages, let's call
+					// MPI_Wait to make sure data have been sent
+					ret = MPI_Wait(&req->size_req, MPI_STATUS_IGNORE);
+					STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %d", ret);
+					free(req->ptr);
+				}
+				if (req->request_type == RECV_REQ)
+				{
+					// req->ptr is freed by starpu_data_unpack
+					starpu_data_unpack(req->data_handle, req->ptr, req->count);
+				}
 			}
 			else
 			{
-				_STARPU_MPI_DEBUG(3, "NOT deleting chandle %p from hashmap (tag %d %d)\n", chandle, req->mpi_tag, starpu_data_get_tag(req->data_handle));
 				_starpu_mpi_handle_free_datatype(req->data_handle, &req->datatype);
 			}
 		}
-		starpu_data_release(req->data_handle);
 	}
 
+	if (req->data_handle)
+		starpu_data_release(req->data_handle);
+
 	if (req->envelope)
 	{
 		free(req->envelope);
@@ -908,61 +909,53 @@ static void _starpu_mpi_submit_new_mpi_request(void *arg)
 			_STARPU_MPI_DEBUG(3, "Calling data_acquire_cb on starpu_mpi_copy_cb..\n");
 			starpu_data_acquire_cb(chandle->handle,STARPU_R,_starpu_mpi_copy_cb,(void*) cb_args);
 		}
-		else
+		/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
+		 * incoming data without a matching pending receive already submitted by the application.
+		 * We immediately allocate the pointer associated to the data_handle, and push it into
+		 * the list of new_requests, so that the real MPI request can be submitted before the next
+		 * submission of the envelope-catching request. */
+		else if (chandle && (req->data_handle == chandle->handle))
 		{
-			/* Case : the request is the internal receive request submitted by StarPU-MPI to receive
-			 * incoming data without a matching pending receive already submitted by the application.
-			 * We immediately allocate the pointer associated to the data_handle, and pushing it into
-			 * the list of new_requests, so as the real MPI request can be submitted before the next
-			 * submission of the envelope-catching request. */
-			if (chandle && (req->data_handle == chandle->handle))
+			_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
+			if (req->user_datatype == 0)
 			{
-				_starpu_mpi_handle_allocate_datatype(req->data_handle, &req->datatype, &req->user_datatype);
-				if (req->user_datatype == 0)
-				{
-					req->count = 1;
-					req->ptr = starpu_data_get_local_ptr(req->data_handle);
-				}
-				else
-				{
-					req->count = chandle->env->psize;
-					req->ptr = malloc(req->count);
-
-					STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
-				}
-
-				_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
-				_starpu_mpi_req_list_push_front(new_requests, req);
-
-				/* inform the starpu mpi thread that the request has beenbe pushed in the new_requests list */
-				STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-				STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
-				req->posted = 1;
-				STARPU_PTHREAD_COND_BROADCAST(&req->posted_cond);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&req->posted_mutex);
-				STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+				req->count = 1;
+				req->ptr = starpu_data_get_local_ptr(req->data_handle);
 			}
-			/* Case : a classic receive request with no send received earlier than expected.
-			 * We just add the pending receive request to the requests' hashmap. */
 			else
 			{
-				add_req(req);
+				req->count = chandle->env->psize;
+				req->ptr = malloc(req->count);
+				STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld\n", req->count);
 			}
 
-			newer_requests = 1;
-			STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
+			_STARPU_MPI_DEBUG(3, "Pushing internal starpu_mpi_irecv request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
+			_starpu_mpi_req_list_push_front(new_requests, req);
+
+			/* inform the starpu mpi thread that the request has been pushed in the new_requests list */
+			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+			STARPU_PTHREAD_MUTEX_LOCK(&req->posted_mutex);
+			req->posted = 1;
+			STARPU_PTHREAD_COND_BROADCAST(&req->posted_cond);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&req->posted_mutex);
+			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		}
+		/* Case : a classic receive request with no send received earlier than expected.
+		 * We just add the pending receive request to the requests' hashmap. */
+		else
+		{
+			add_req(req);
 		}
 	}
 	else
 	{
 		_starpu_mpi_req_list_push_front(new_requests, req);
-
-		newer_requests = 1;
 		_STARPU_MPI_DEBUG(3, "Pushing new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d user_datatype %d \n",
 				  req, _starpu_mpi_request_type(req->request_type), req->mpi_tag, req->srcdst, req->data_handle, req->ptr, _starpu_mpi_datatype(req->datatype), (int)req->count, req->user_datatype);
-		STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	}
 
+	newer_requests = 1;
+	STARPU_PTHREAD_COND_BROADCAST(&cond_progression);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	_STARPU_MPI_LOG_OUT();
 }
@@ -1311,7 +1304,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_ASSERT_MSG(_starpu_mpi_req_list_empty(new_requests), "List of new requests not empty");
 	STARPU_ASSERT_MSG(posted_requests == 0, "Number of posted request is not zero");
 	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_req_hashmap) == 0, "Number of receive requests left is not zero");
-
+	STARPU_ASSERT_MSG(HASH_COUNT(_starpu_mpi_copy_handle_hashmap) == 0, "Number of copy requests left is not zero");
 	if (argc_argv->initialize_mpi)
 	{
 		_STARPU_MPI_DEBUG(3, "Calling MPI_Finalize()\n");

+ 7 - 7
sc_hypervisor/include/sc_hypervisor.h

@@ -47,10 +47,10 @@ struct sc_hypervisor_policy
 	unsigned custom;
 
 	/* Distribute workers to contexts even at the begining of the program */
-	void (*size_ctxs)(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
+	void (*size_ctxs)(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
 
 	/* Require explicit resizing */
-	void (*resize_ctxs)(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
+	void (*resize_ctxs)(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
 
 	/* the hypervisor takes a decision when the worker was idle for another cyle in this ctx */
 	void (*handle_idle_cycle)(unsigned sched_ctx, int worker);
@@ -90,7 +90,7 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx);
 void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag);
 
 /* reevaluate the distribution of the resources and eventually resize if needed */
-void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
+void sc_hypervisor_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers);
 
 /* don't allow the hypervisor to resize a context */
 void sc_hypervisor_stop_resize(unsigned sched_ctx);
@@ -111,13 +111,13 @@ void sc_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigne
 void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int *workers_to_move, unsigned nworkers_to_move, unsigned now);
 
 /* ask the hypervisor to chose a distribution of workers in the required contexts */
-void sc_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
+void sc_hypervisor_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
 
 /* check if there are pending demands of resizing */
-unsigned sc_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers);
+unsigned sc_hypervisor_get_size_req(unsigned **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers);
 
 /* save a demand of resizing */
-void sc_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
+void sc_hypervisor_save_size_req(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers);
 
 /* clear the list of pending demands of resizing */
 void sc_hypervisor_free_size_req(void);
@@ -126,7 +126,7 @@ void sc_hypervisor_free_size_req(void);
 unsigned sc_hypervisor_can_resize(unsigned sched_ctx);
 
 /* indicate the types of tasks a context will execute in order to better decide the sizing of ctxs */
-	void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size);
+void sc_hypervisor_set_type_of_task(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, size_t data_size);
 
 #ifdef __cplusplus
 }

+ 6 - 6
sc_hypervisor/include/sc_hypervisor_lp.h

@@ -33,9 +33,9 @@ extern "C"
 #endif //STARPU_HAVE_GLPK_H
 
 struct sc_hypervisor_policy_task_pool; 
-
+struct types_of_workers;
 /* returns tmax, and computes in table res the nr of workers needed by each context st the system ends up in the smallest tmax*/
-double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers]);
+double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers], struct types_of_workers *tw);
 
 /* returns tmax of the system */
 double sc_hypervisor_lp_get_tmax(int nw, int *workers);
@@ -44,13 +44,13 @@ double sc_hypervisor_lp_get_tmax(int nw, int *workers);
 void sc_hypervisor_lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw]);
 
 /* redistribute the ressource in contexts by assigning the first x available ressources to each one */
-void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *sched_ctxs);
+void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], unsigned *sched_ctxs, struct types_of_workers *tw);
 
 /* make the first distribution of ressource in contexts by assigning the first x available ressources to each one */
-void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers);
+void sc_hypervisor_lp_distribute_resources_in_ctxs(unsigned* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers, struct types_of_workers *tw);
 
 /* place resources in contexts dependig on whether they already have workers or not */
-void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sched_ctxs, int *workers, unsigned do_size);
+void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], unsigned *sched_ctxs, int *workers, unsigned do_size, struct types_of_workers *tw);
 
 /* dichotomy btw t1 & t2 */
 double sc_hypervisor_lp_find_tmax(double t1, double t2);
@@ -69,7 +69,7 @@ double sc_hypervisor_lp_simulate_distrib_flops(int nsched_ctxs, int ntypes_of_wo
 
 /* linear program that simulates a distribution of tasks that minimises the execution time of the tasks in the pool */
 double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt],
-					       double times[nw][nt], unsigned is_integer, double tmax, int *in_sched_ctxs,
+					       double times[nw][nt], unsigned is_integer, double tmax, unsigned *in_sched_ctxs,
 					       struct sc_hypervisor_policy_task_pool *tmp_task_pools);
 
 #endif // STARPU_HAVE_GLPK_H

+ 1 - 1
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -105,7 +105,7 @@ struct sc_hypervisor_wrapper
 struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
 
 /* get the list of registered contexts */
-int *sc_hypervisor_get_sched_ctxs();
+unsigned *sc_hypervisor_get_sched_ctxs();
 
 /* get the number of registered contexts */
 int sc_hypervisor_get_nsched_ctxs();

+ 19 - 3
sc_hypervisor/include/sc_hypervisor_policy.h

@@ -31,6 +31,13 @@ extern "C"
 #define SC_IDLE 1
 #define SC_SPEED 2
 
+struct types_of_workers
+{
+	unsigned ncpus;
+	unsigned ncuda;
+	unsigned nw;
+};
+
 struct sc_hypervisor_policy_task_pool
 {
 	struct starpu_codelet *cl;
@@ -42,7 +49,7 @@ struct sc_hypervisor_policy_task_pool
 };
 
 /* add task information to a task wrapper linked list */
-	void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools, size_t data_size);
+void sc_hypervisor_policy_add_task_to_pool(struct starpu_codelet *cl, unsigned sched_ctx, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools, size_t data_size);
 
 /* remove task information from a task wrapper linked list */
 void sc_hypervisor_policy_remove_task_from_pool(struct starpu_task *task, uint32_t footprint, struct sc_hypervisor_policy_task_pool **task_pools);
@@ -63,7 +70,7 @@ int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum st
 int* sc_hypervisor_get_idlest_workers_in_list(int *start, int *workers, int nall_workers,  int *nworkers, enum starpu_worker_archtype arch);
 
 /* find workers that can be moved from a context (if the constraints of min, max, etc allow this) */
-unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_worker_archtype arch);
+int sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_worker_archtype arch);
 
 /* compute how many workers should be moved from this context */
 int sc_hypervisor_compute_nworkers_to_move(unsigned req_sched_ctx);
@@ -93,7 +100,13 @@ double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_
 double sc_hypervisor_get_ref_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
 /* get the list of workers grouped by type */
-void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers]);
+void sc_hypervisor_group_workers_by_type(struct types_of_workers *tw, int *total_nw);
+
+/* get what type of worker corresponds to a certain index of types of workers */
+enum starpu_worker_archtype sc_hypervisor_get_arch_for_index(unsigned w, struct types_of_workers *tw);
+
+/* get the index of types of workers corresponding to the type of workers indicated */
+unsigned sc_hypervisor_get_index_for_arch(enum starpu_worker_archtype arch, struct types_of_workers *tw);
 
 /* check if we trigger resizing or not */
 unsigned sc_hypervisor_criteria_fulfilled(unsigned sched_ctx, int worker);
@@ -107,6 +120,9 @@ unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void);
 /* check what triggers resizing (idle, speed, etc.)*/
 unsigned sc_hypervisor_get_resize_criteria();
 
+/* load information concerning the type of workers into a types_of_workers struct */
+struct types_of_workers* sc_hypervisor_get_types_of_workers(int *workers, unsigned nworkers);
+
 #ifdef __cplusplus
 }
 #endif

+ 33 - 29
sc_hypervisor/src/hypervisor_policies/debit_lp_policy.c

@@ -20,14 +20,14 @@
 #include <math.h>
 #include <sys/time.h>
 
-static double _glp_resolve(int ns, int nw, double speed[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer);
+static double _glp_resolve(int ns, int nw, double speed[ns][nw], double w_in_s[ns][nw], unsigned integer);
 
 
-static unsigned _compute_max_speed(int ns, int nw, double w_in_s[ns][nw], int *in_sched_ctxs, int *workers)
+static unsigned _compute_max_speed(int ns, int nw, double w_in_s[ns][nw], unsigned *in_sched_ctxs, int *workers)
 {
 	double speed[ns][nw];
 
-	int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
+	unsigned *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 	
 	int w,s;
 
@@ -50,7 +50,7 @@ static unsigned _compute_max_speed(int ns, int nw, double w_in_s[ns][nw], int *i
 	struct timeval end_time;
 	gettimeofday(&start_time, NULL);
 
-	double res = _glp_resolve(ns, nw, speed, w_in_s, workers, 1);
+	double res = _glp_resolve(ns, nw, speed, w_in_s, 1);
 	gettimeofday(&end_time, NULL);
 
 	long diff_s = end_time.tv_sec  - start_time.tv_sec;
@@ -68,9 +68,9 @@ static unsigned _compute_max_speed(int ns, int nw, double w_in_s[ns][nw], int *i
  */
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>
-static double _glp_resolve(int ns, int nw, double speed[ns][nw], double w_in_s[ns][nw], int *workers, unsigned integer)
+static double _glp_resolve(int ns, int nw, double speed[ns][nw], double w_in_s[ns][nw], unsigned integer)
 {
-	int w, s;
+	int w = 0, s = 0;
 	glp_prob *lp;
 
 	lp = glp_create_prob();
@@ -225,10 +225,10 @@ static double _glp_resolve(int ns, int nw, double speed[ns][nw], double w_in_s[n
 }
 
 
-static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
-	int nw = workers == NULL ? starpu_worker_get_count() : nworkers; /* Number of different workers */
+	int nw = workers == NULL ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
 	
 	sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
 
@@ -237,16 +237,17 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int n
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
 	{
+		struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, nw);
 		int w, s;
-		double nworkers_per_ctx[ns][2];
-		int nworkers_per_ctx_rounded[ns][2];
+		double nworkers_per_ctx[ns][tw->nw];
+		int nworkers_per_ctx_rounded[ns][tw->nw];
 		for(s = 0; s < ns; s++)
 		{
-			nworkers_per_ctx[s][0] = 0.0;
-			nworkers_per_ctx[s][1] = 0.0;
-			nworkers_per_ctx_rounded[s][0] = 0;
-			nworkers_per_ctx_rounded[s][1] = 0;
-			
+			for(w = 0; w < nw; w++)
+			{
+				nworkers_per_ctx[s][w] = 0.0;
+				nworkers_per_ctx_rounded[s][w] = 0;
+			}
 		}
 		
 		for(s = 0; s < ns; s++)
@@ -254,18 +255,20 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int n
 			for(w = 0; w < nw; w++)
 			{
 				enum starpu_worker_archtype arch = starpu_worker_get_type(w);
+				int idx = sc_hypervisor_get_index_for_arch(arch, tw);
+				nworkers_per_ctx[s][idx] += w_in_s[s][w];
 				
 				if(arch == STARPU_CUDA_WORKER)
 				{
-					nworkers_per_ctx[s][0] += w_in_s[s][w];
 					if(w_in_s[s][w] >= 0.3)
-						nworkers_per_ctx_rounded[s][0]++;
+						nworkers_per_ctx_rounded[s][idx]++;
 				}
 				else
 				{
-					nworkers_per_ctx[s][1] += w_in_s[s][w];
 					if(w_in_s[s][w] > 0.5)
-						nworkers_per_ctx_rounded[s][1]++;
+						nworkers_per_ctx_rounded[s][idx]++;
 				}
 			}
 		}
@@ -273,12 +276,14 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int n
 /* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
 /* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
 		
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_per_ctx_rounded, nworkers_per_ctx, sched_ctxs);
+
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, tw->nw, nworkers_per_ctx_rounded, nworkers_per_ctx, sched_ctxs, tw);
 		
 	}
 }
 
-static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+static void debit_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx, __attribute__((unused))int worker, 
+				       __attribute__((unused))struct starpu_task *task, __attribute__((unused))uint32_t footprint)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
         if(ret != EBUSY)
@@ -295,7 +300,7 @@ static void debit_lp_handle_poped_task(unsigned sched_ctx, int worker, struct st
 	}
 }
 
-static debit_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+static void debit_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
         if(ret != EBUSY)
@@ -307,15 +312,14 @@ static debit_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
                         {
                                 _try_resizing(NULL, -1, NULL, -1);
-//                              sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);                                                                                                               \
-                                                                                                                                                                                                                    
-                        }
+//                              sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
+			}
                 }
                 starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
         }
 }
 
-static void debit_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void debit_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
@@ -325,10 +329,10 @@ static void debit_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers
 	}
 }
 
-static void debit_lp_end_ctx(unsigned sched_ctx)
+static void debit_lp_end_ctx(__attribute__((unused))unsigned sched_ctx)
 {
-	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
-	int worker;
+/* 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx); */
+/* 	int worker; */
 /* 	for(worker = 0; worker < 12; worker++) */
 /* 		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_speed[worker]); */
 

+ 28 - 28
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -20,28 +20,27 @@
 #include <sys/time.h>
 
 #ifdef STARPU_HAVE_GLPK_H
-static void _try_resizing(int *sched_ctxs, int nsched_ctxs)
+static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	/* for vite */
 	starpu_trace_user_event(2);
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
-	sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
+	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
+	unsigned curr_nworkers = nworkers == -1 ? starpu_worker_get_count() : (unsigned)nworkers;
+	
+	struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, curr_nworkers);
+	int nw = tw->nw;
+	double nworkers_per_ctx[ns][nw];
 
-	double nworkers_per_ctx[ns][2];
-	int nw = 1;
-#ifdef STARPU_USE_CUDA
-	int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
-	nw = ncuda != 0 ? 2 : 1;
-#endif
 	int total_nw[nw];
-	sc_hypervisor_group_workers_by_type(NULL, -1, nw, total_nw);
+	sc_hypervisor_group_workers_by_type(tw, total_nw);
 	
 	
 	struct timeval start_time;
 	struct timeval end_time;
 	gettimeofday(&start_time, NULL);
 	
-	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(ns, nw, nworkers_per_ctx, total_nw);
+	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(ns, nw, nworkers_per_ctx, total_nw, tw);
 	gettimeofday(&end_time, NULL);
 	
 	long diff_s = end_time.tv_sec  - start_time.tv_sec;
@@ -53,7 +52,7 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs)
 	{
-		int nworkers_per_ctx_rounded[nsched_ctxs][nw];
+		int nworkers_per_ctx_rounded[ns][nw];
 		sc_hypervisor_lp_round_double_to_int(ns, nw, nworkers_per_ctx, nworkers_per_ctx_rounded);
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, sched_ctxs);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, curr_sched_ctxs, tw);
 	}
 }
 
@@ -68,27 +67,29 @@ static void feft_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx,
 		{
 			if(sc_hypervisor_check_speed_gap_btw_ctxs())
 			{
-				_try_resizing(NULL, -1);
+				_try_resizing(NULL, -1, NULL, -1);
 			}
 		}
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 
 }
-static void feft_lp_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+static void feft_lp_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
-	int nw = 1;
-#ifdef STARPU_USE_CUDA
-	int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
-	nw = ncuda != 0 ? 2 : 1;
-#endif
+	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
+	unsigned curr_nworkers = nworkers == -1 ? starpu_worker_get_count() : (unsigned)nworkers;
+	
+	struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, curr_nworkers);
+	int nw = tw->nw;
 	double nworkers_per_type[ns][nw];
+
 	int total_nw[nw];
-	sc_hypervisor_group_workers_by_type(workers, nworkers, nw, total_nw);
+	sc_hypervisor_group_workers_by_type(tw, total_nw);
+	
 
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(ns, nw, nworkers_per_type, total_nw);
+	double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(ns, nw, nworkers_per_type, total_nw, tw);
 	if(vmax != 0.0)
 	{
 // 		printf("********size\n");
@@ -113,13 +114,12 @@ static void feft_lp_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, in
 /* 				printf("ctx %d/worker type %d: n = %d \n", i, 1, nworkers_per_type_rounded[i][1]); */
 /* #endif */
 /* 		} */
-		int *current_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
 
 		unsigned has_workers = 0;
 		int s;
 		for(s = 0; s < ns; s++)
 		{
-			int nworkers_ctx = sc_hypervisor_get_nworkers_ctx(current_sched_ctxs[s], STARPU_ANY_WORKER);
+			int nworkers_ctx = sc_hypervisor_get_nworkers_ctx(curr_sched_ctxs[s], STARPU_ANY_WORKER);
 			if(nworkers_ctx != 0)
 			{
 				has_workers = 1;
@@ -127,9 +127,9 @@ static void feft_lp_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, in
 			}
 		}
 		if(has_workers)
-			sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_type_rounded, nworkers_per_type, current_sched_ctxs);
+			sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_type_rounded, nworkers_per_type, curr_sched_ctxs, tw);
 		else
-			sc_hypervisor_lp_distribute_resources_in_ctxs(sched_ctxs, ns, nw, nworkers_per_type_rounded, nworkers_per_type, workers, nworkers);
+			sc_hypervisor_lp_distribute_resources_in_ctxs(sched_ctxs, ns, nw, nworkers_per_type_rounded, nworkers_per_type, workers, curr_nworkers, tw);
 	}
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
@@ -145,7 +145,7 @@ static void feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 			
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
 			{
-				_try_resizing(NULL, -1);
+				_try_resizing(NULL, -1, NULL, -1);
 //				sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
 			}
 		}
@@ -153,8 +153,8 @@ static void feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 	}
 }
 
-static void feft_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , 
-				__attribute__((unused))int *workers, __attribute__((unused))int nworkers)
+static void feft_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , 
+				int *workers, int nworkers)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
@@ -172,7 +172,7 @@ static void feft_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs ,
 			 }
 		}
 
-		_try_resizing(sched_ctxs, nsched_ctxs);
+		_try_resizing(sched_ctxs, nsched_ctxs, workers, nworkers);
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 }

+ 10 - 9
sc_hypervisor/src/hypervisor_policies/gflops_rate_policy.c

@@ -70,11 +70,11 @@ static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sc
         if(nworkers_needed > 0)
         {
                 struct sc_hypervisor_policy_config *sender_config = sc_hypervisor_get_config(sender_sched_ctx);
-                unsigned potential_moving_cpus = sc_hypervisor_get_movable_nworkers(sender_config, sender_sched_ctx, STARPU_CPU_WORKER);
-                unsigned potential_moving_gpus = sc_hypervisor_get_movable_nworkers(sender_config, sender_sched_ctx, STARPU_CUDA_WORKER);
-                unsigned sender_nworkers = starpu_sched_ctx_get_nworkers(sender_sched_ctx);
+                int potential_moving_cpus = sc_hypervisor_get_movable_nworkers(sender_config, sender_sched_ctx, STARPU_CPU_WORKER);
+                int potential_moving_gpus = sc_hypervisor_get_movable_nworkers(sender_config, sender_sched_ctx, STARPU_CUDA_WORKER);
+                int sender_nworkers = (int)starpu_sched_ctx_get_nworkers(sender_sched_ctx);
                 struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(receiver_sched_ctx);
-                unsigned nworkers_ctx = starpu_sched_ctx_get_nworkers(receiver_sched_ctx);
+                int nworkers_ctx = (int)starpu_sched_ctx_get_nworkers(receiver_sched_ctx);
 
                 if(nworkers_needed < (potential_moving_cpus + 5 * potential_moving_gpus))
                 {
@@ -119,7 +119,7 @@ static int* _get_workers_to_move(unsigned sender_sched_ctx, unsigned receiver_sc
 
                         if(sender_nworkers - nworkers_to_move >= sender_config->min_nworkers)
                         {
-                                unsigned nshared_workers = starpu_sched_ctx_get_nshared_workers(sender_sched_ctx, receiver_sched_ctx);
+                                int nshared_workers = (int)starpu_sched_ctx_get_nshared_workers(sender_sched_ctx, receiver_sched_ctx);
                                 if((nworkers_ctx + nworkers_to_move - nshared_workers) > config->max_nworkers)
                                         nworkers_to_move = nworkers_ctx > config->max_nworkers ? 0 : (config->max_nworkers - nworkers_ctx + nshared_workers);
 
@@ -165,11 +165,11 @@ static unsigned _gflops_rate_resize(unsigned sender_sched_ctx, unsigned receiver
 
 static int _find_fastest_sched_ctx()
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	double first_exp_end = _get_exp_end(sched_ctxs[0]);
-	int fastest_sched_ctx = first_exp_end == -1.0  ? -1 : sched_ctxs[0];
+	int fastest_sched_ctx = first_exp_end == -1.0  ? -1 : (int)sched_ctxs[0];
 	double curr_exp_end = 0.0;
 	int i;
 	for(i = 1; i < nsched_ctxs; i++)
@@ -188,7 +188,7 @@ static int _find_fastest_sched_ctx()
 
 static int _find_slowest_sched_ctx()
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int slowest_sched_ctx = -1;
@@ -214,7 +214,7 @@ static int _find_slowest_sched_ctx()
 
 static int _find_slowest_available_sched_ctx(unsigned sched_ctx)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int slowest_sched_ctx = -1;
@@ -296,6 +296,7 @@ static void gflops_rate_handle_poped_task(unsigned sched_ctx, int worker)
 
 struct sc_hypervisor_policy gflops_rate_policy = {
 	.size_ctxs = NULL,
+	.resize_ctxs = NULL,
 	.handle_poped_task = gflops_rate_handle_poped_task,
 	.handle_pushed_task = NULL,
 	.handle_idle_cycle = NULL,

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/idle_policy.c

@@ -18,7 +18,7 @@
 
 unsigned worker_belong_to_other_sched_ctx(unsigned sched_ctx, int worker)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int i;

+ 25 - 23
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -253,7 +253,7 @@ static double _glp_resolve (int ns, int nw, double final_w_in_s[ns][nw],
 	return res;
 }
 
-static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double **flops_on_w, int *in_sched_ctxs, int *workers)
+static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double **flops_on_w, unsigned *sched_ctxs, int *workers)
 {
 //	double flops[ns];
 //	double speed[ns][nw];
@@ -262,8 +262,6 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 	int i;
 	for(i = 0; i < ns; i++)
 		speed[i] = (double*)malloc(nw*sizeof(double));
-
-	int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 	
 	int w,s;
 
@@ -286,7 +284,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx);
 					if(!worker_in_ctx)
 					{
-						double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker) / 1000;
+						double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)) / 1000;
 						speed[s][w] = (speed[s][w] * transfer_speed) / (speed[s][w] + transfer_speed);
 					}
 				}
@@ -323,13 +321,17 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 	return found_sol;
 }
 
-static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
-	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
+	starpu_trace_user_event(2);
+        int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	int nw = nworkers == -1 ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
+        unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
+
+        struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, nw);
+        int ntypes_of_workers = tw->nw;
+
 
-	sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
-	
 	double w_in_s[ns][nw];
 //			double flops_on_w[ns][nw];
 	double **flops_on_w = (double**)malloc(ns*sizeof(double*));
@@ -337,20 +339,20 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int n
 	for(i = 0; i < ns; i++)
 		flops_on_w[i] = (double*)malloc(nw*sizeof(double));
 	
-	unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw,  w_in_s, flops_on_w, sched_ctxs, workers);
+	unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw,  w_in_s, flops_on_w, curr_sched_ctxs, workers);
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
 	{
 		int w, s;
-		double nworkers_per_ctx[ns][2];
-		int nworkers_per_ctx_rounded[ns][2];
+		double nworkers_per_ctx[ns][ntypes_of_workers];
+		int nworkers_per_ctx_rounded[ns][ntypes_of_workers];
 		for(s = 0; s < ns; s++)
 		{
-			nworkers_per_ctx[s][0] = 0.0;
-			nworkers_per_ctx[s][1] = 0.0;
-			nworkers_per_ctx_rounded[s][0] = 0;
-			nworkers_per_ctx_rounded[s][1] = 0;
-			
+			for(w = 0; w < ntypes_of_workers; w++)
+			{
+				nworkers_per_ctx[s][w] = 0.0;
+				nworkers_per_ctx_rounded[s][w] = 0;
+			}
 		}
 		
 		for(s = 0; s < ns; s++)
@@ -358,18 +360,18 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int n
 			for(w = 0; w < nw; w++)
 			{
 				enum starpu_worker_archtype arch = starpu_worker_get_type(w);
-				
+		
+				int idx = sc_hypervisor_get_index_for_arch(arch, tw);
+				nworkers_per_ctx[s][idx] += w_in_s[s][w];
 				if(arch == STARPU_CUDA_WORKER)
 				{
-					nworkers_per_ctx[s][0] += w_in_s[s][w];
 					if(w_in_s[s][w] >= 0.3)
-						nworkers_per_ctx_rounded[s][0]++;
+						nworkers_per_ctx_rounded[s][idx]++;
 				}
 				else
 				{
-					nworkers_per_ctx[s][1] += w_in_s[s][w];
 					if(w_in_s[s][w] > 0.5)
-						nworkers_per_ctx_rounded[s][1]++;
+						nworkers_per_ctx_rounded[s][idx]++;
 				}
 			}
 		}
@@ -377,7 +379,7 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int n
 /* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
 /* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
 		
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_per_ctx_rounded, nworkers_per_ctx, sched_ctxs);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, ntypes_of_workers, nworkers_per_ctx_rounded, nworkers_per_ctx, curr_sched_ctxs, tw);
 	}
 	for(i = 0; i < ns; i++)
 		free(flops_on_w[i]);
@@ -421,7 +423,7 @@ static void ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
         }
 }
 
-static void ispeed_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void ispeed_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)

+ 3 - 3
sc_hypervisor/src/hypervisor_policies/ispeed_policy.c

@@ -18,7 +18,7 @@
 
 static unsigned _get_fastest_sched_ctx(void)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	int fastest_sched_ctx = STARPU_NMAX_SCHED_CTXS;
@@ -40,7 +40,7 @@ static unsigned _get_fastest_sched_ctx(void)
 
 static unsigned _get_slowest_sched_ctx(void)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	double smallest_speed = sc_hypervisor_get_ctx_speed(sc_hypervisor_get_wrapper(sched_ctxs[0]));
@@ -141,7 +141,7 @@ static int* _get_slowest_workers(unsigned sched_ctx, int *nworkers, enum starpu_
 	return curr_workers;
 }			
 
-static void ispeed_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+static void ispeed_handle_poped_task(unsigned sched_ctx, int worker, __attribute__((unused))struct starpu_task *task, __attribute__((unused))uint32_t footprint)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)

+ 19 - 12
sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c

@@ -28,7 +28,7 @@ struct teft_lp_data
 {
 	int nt;
 	double **tasks;
-	int *in_sched_ctxs;
+	unsigned *in_sched_ctxs;
 	int *workers;
 	struct sc_hypervisor_policy_task_pool *tmp_task_pools;
 	unsigned size_ctxs;
@@ -41,7 +41,7 @@ static double _compute_workers_distrib(int ns, int nw, double final_w_in_s[ns][n
 
 	int nt = sd->nt;
 	double **final_tasks = sd->tasks;
-	int *in_sched_ctxs = sd->in_sched_ctxs;
+	unsigned *in_sched_ctxs = sd->in_sched_ctxs;
 	int *workers = sd->workers;
 	struct sc_hypervisor_policy_task_pool *tmp_task_pools = sd->tmp_task_pools;
 	unsigned size_ctxs = sd->size_ctxs;
@@ -74,7 +74,7 @@ static double _compute_workers_distrib(int ns, int nw, double final_w_in_s[ns][n
 	return res;
 }
 	
-static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void _size_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	int nw = workers == NULL ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
@@ -114,7 +114,10 @@ static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nwor
 	starpu_pthread_mutex_unlock(&mutex);
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
-		sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, workers, 1);
+	{
+		struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, nw);
+		sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, workers, 1, tw);
+	}
 	
 	for(i = 0; i < nw; i++)
 		free(tasks[i]);
@@ -125,7 +128,8 @@ static void _size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nwor
 static void size_if_required()
 {
 	int nsched_ctxs, nworkers;
-	int *sched_ctxs, *workers;
+	unsigned *sched_ctxs;
+	int *workers;
 	unsigned has_req = sc_hypervisor_get_size_req(&sched_ctxs, &nsched_ctxs, &workers, &nworkers);
 
 	if(has_req)
@@ -161,7 +165,7 @@ static void teft_lp_handle_submitted_job(struct starpu_codelet *cl, unsigned sch
 	size_if_required();
 }
 
-static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	starpu_trace_user_event(2);
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
@@ -214,7 +218,10 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int n
 	
 	/* if we did find at least one solution redistribute the resources */
 	if(found_sol)
-		sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, workers, 0);
+	{
+		struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, nw);
+		sc_hypervisor_lp_place_resources_in_ctx(ns, nw, w_in_s, sched_ctxs, workers, 0, tw);
+	}
 	
 	struct sc_hypervisor_policy_task_pool *next = NULL;
 	struct sc_hypervisor_policy_task_pool *tmp_tp = tmp_task_pools;
@@ -230,7 +237,7 @@ static void _try_resizing(int *sched_ctxs, int nsched_ctxs , int *workers, int n
 	free(tasks_per_worker);
 }
 
-static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
+static void teft_lp_handle_poped_task(unsigned sched_ctx, __attribute__((unused))int worker, struct starpu_task *task, uint32_t footprint)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 
@@ -262,7 +269,7 @@ static void teft_lp_handle_poped_task(unsigned sched_ctx, int worker, struct sta
 
 }
 
-static int teft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
+static void teft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 {
 	struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
 
@@ -288,15 +295,15 @@ static int teft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 		}
 		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
-	return 0;
+	return;
 }
 
-static void teft_lp_size_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void teft_lp_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	sc_hypervisor_save_size_req(sched_ctxs, nsched_ctxs, workers, nworkers);
 }
 
-static void teft_lp_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+static void teft_lp_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)

+ 3 - 3
sc_hypervisor/src/policies_utils/lp_programs.c

@@ -23,7 +23,7 @@
 #ifdef STARPU_HAVE_GLPK_H
 
 double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_in_s[ns][nw], double tasks[nw][nt],
-					       double times[nw][nt], unsigned is_integer, double tmax, int *in_sched_ctxs,
+					       double times[nw][nt], unsigned is_integer, double tmax, unsigned *in_sched_ctxs,
 					       struct sc_hypervisor_policy_task_pool *tmp_task_pools)
 {
 	struct sc_hypervisor_policy_task_pool * tp;
@@ -80,7 +80,7 @@ double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_
 					glp_set_col_bnds(lp, nw*nt+s*nw+w+1, GLP_DB, 0.0, 1.0);
 			}
 
-		int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
+		unsigned *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
 
 		int curr_row_idx = 0;
 		/* Total worker execution time */
@@ -110,7 +110,7 @@ double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_
 				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
 				for (t = 0, tp = tmp_task_pools; tp; t++, tp = tp->next)
 				{
-					if((int)tp->sched_ctx_id == sched_ctxs[s])
+					if(tp->sched_ctx_id == sched_ctxs[s])
 					{
 						ia[n] = curr_row_idx+s*nw+w+1;
 						ja[n] = colnum(w, t);

+ 88 - 76
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -25,30 +25,35 @@
 
 #endif //STARPU_HAVE_GLPK_H
 
-double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], int total_nw[ntypes_of_workers])
+double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_workers, double res[nsched_ctxs][ntypes_of_workers], 
+					     int total_nw[ntypes_of_workers], struct types_of_workers *tw)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 #ifdef STARPU_HAVE_GLPK_H
 	double v[nsched_ctxs][ntypes_of_workers];
 	double flops[nsched_ctxs];
 
+	int nw = tw->nw;
 	int i = 0;
 	struct sc_hypervisor_wrapper* sc_w;
 	for(i = 0; i < nsched_ctxs; i++)
 	{
 		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
-#ifdef STARPU_USE_CUDA
-		int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
-		if(ncuda != 0)
-		{
-			v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CUDA_WORKER);
-			v[i][1] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
-		}
-		else
-			v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
-#else
-		v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
-#endif // STARPU_USE_CUDA
+/* #ifdef STARPU_USE_CUDA */
+/* 		int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER); */
+/* 		if(ncuda != 0) */
+/* 		{ */
+/* 			v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CUDA_WORKER); */
+/* 			v[i][1] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER); */
+/* 		} */
+/* 		else */
+/* 			v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER); */
+/* #else */
+/* 		v[i][0] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER); */
+/* #endif // STARPU_USE_CUDA */
+		int w;
+		for(w = 0; w < nw; w++)
+			v[i][w] = sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw)); 
 		
 		flops[i] = sc_w->remaining_flops < 0.0 ? 0.0 : sc_w->remaining_flops/1000000000; //sc_w->total_flops/1000000000; /* in gflops*/
 //		printf("%d: flops %lf\n", sched_ctxs[i], flops[i]);
@@ -74,16 +79,18 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 #endif//STARPU_HAVE_GLPK_H
 }
 
-double sc_hypervisor_lp_get_tmax(int nw, int *workers)
+double sc_hypervisor_lp_get_tmax(int nworkers, int *workers)
 {
-	int ntypes_of_workers = 2;
-	int total_nw[ntypes_of_workers];
-	sc_hypervisor_group_workers_by_type(workers, nw, 2, total_nw);
+	struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, nworkers);
+        int nw = tw->nw;
+
+        int total_nw[nw];
+        sc_hypervisor_group_workers_by_type(tw, total_nw);
 
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
-	double res[nsched_ctxs][ntypes_of_workers];
-	return sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, ntypes_of_workers, res, total_nw) * 1000.0;
+	double res[nsched_ctxs][nw];
+	return sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, res, total_nw, tw) * 1000.0;
 }
 
 void sc_hypervisor_lp_round_double_to_int(int ns, int nw, double res[ns][nw], int res_rounded[ns][nw])
@@ -147,22 +154,25 @@ void sc_hypervisor_lp_round_double_to_int(int ns, int nw, double res[ns][nw], in
 void _lp_find_workers_to_give_away(int nw, int ns, unsigned sched_ctx, int sched_ctx_idx, 
 				  int tmp_nw_move[nw], int tmp_workers_move[nw][STARPU_NMAXWORKERS], 
 				  int tmp_nw_add[nw], int tmp_workers_add[nw][STARPU_NMAXWORKERS],
-				  int res_rounded[ns][nw], double res[ns][nw])
+				   int res_rounded[ns][nw], double res[ns][nw], struct types_of_workers *tw)
 {
 	int w;
+	double target_res = 0.0;
+	for(w = 0; w < nw; w++)
+		target_res += res[sched_ctx_idx][w];
+
 	for(w = 0; w < nw; w++)
 	{
-		enum starpu_worker_archtype arch = STARPU_ANY_WORKER;
-		if(w == 0) arch = STARPU_CUDA_WORKER;
-		if(w == 1) arch = STARPU_CPU_WORKER;
-		
+		enum starpu_worker_archtype arch = sc_hypervisor_get_arch_for_index(w, tw);
 		
-		if(w == 1)
+		if(arch == STARPU_CPU_WORKER) 
 		{
 			int nworkers_ctx = sc_hypervisor_get_nworkers_ctx(sched_ctx, arch);
 			if(nworkers_ctx > res_rounded[sched_ctx_idx][w])
 			{
 				int nworkers_to_move = nworkers_ctx - res_rounded[sched_ctx_idx][w];
+				if(target_res == 0.0 && nworkers_to_move > 0)
+					nworkers_to_move--;
 				int *workers_to_move = sc_hypervisor_get_idlest_workers(sched_ctx, &nworkers_to_move, arch);
 				int i;
 				for(i = 0; i < nworkers_to_move; i++)
@@ -220,15 +230,13 @@ void _lp_find_workers_to_accept(int nw, int ns, unsigned sched_ctx, int sched_ct
 				int tmp_nw_add[nw], int tmp_workers_add[nw][STARPU_NMAXWORKERS],
 				int *nw_move, int workers_move[STARPU_NMAXWORKERS], 
 				int *nw_add, int workers_add[STARPU_NMAXWORKERS],
-				int res_rounded[ns][nw], double res[ns][nw])
+				int res_rounded[ns][nw], double res[ns][nw], struct types_of_workers *tw)
 {
 	int w;
 	int j = 0, k = 0;
 	for(w = 0; w < nw; w++)
 	{
-		enum starpu_worker_archtype arch = STARPU_ANY_WORKER;
-		if(w == 0) arch = STARPU_CUDA_WORKER;
-		if(w == 1) arch = STARPU_CPU_WORKER;
+		enum starpu_worker_archtype arch = sc_hypervisor_get_arch_for_index(w, tw);
 		
 		int nw_ctx2 = sc_hypervisor_get_nworkers_ctx(sched_ctx, arch);
 		int nw_needed = res_rounded[sched_ctx_idx][w] - nw_ctx2;
@@ -299,7 +307,7 @@ void _lp_find_workers_to_remove(int nw, int tmp_nw_move[nw], int tmp_workers_mov
 	}
 }
 
-void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *sched_ctxs)
+void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], unsigned *sched_ctxs, struct types_of_workers *tw)
 {
 	int s, s2, w;
 	for(s = 0; s < ns; s++)
@@ -326,7 +334,7 @@ void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rou
 		/* find workers that ctx s has to give away */
 		_lp_find_workers_to_give_away(nw, ns, sched_ctxs[s], s, 
 					      tmp_nw_move, tmp_workers_move, 
-					      tmp_nw_add, tmp_workers_add, res_rounded, res);
+					      tmp_nw_add, tmp_workers_add, res_rounded, res, tw);
 
 		for(s2 = 0; s2 < ns; s2++)
 		{
@@ -346,7 +354,7 @@ void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rou
 							   tmp_nw_add, tmp_workers_add,
 							   &nw_move, workers_move, 
 							   &nw_add, workers_add,
-							   res_rounded, res);
+							   res_rounded, res, tw);
 				
 				if(nw_move > 0)
 				{
@@ -374,9 +382,8 @@ void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rou
 	}
 }
 
-void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers)
+void sc_hypervisor_lp_distribute_resources_in_ctxs(unsigned* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers, struct types_of_workers *tw)
 {
-	unsigned current_nworkers = workers == NULL ? starpu_worker_get_count() : (unsigned)nworkers;
 	int s, w;
 	int start[nw];
 	for(w = 0; w < nw; w++)
@@ -385,33 +392,36 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int
 	{
 		int workers_add[STARPU_NMAXWORKERS];
                 int nw_add = 0;
-		
+		double target_res = 0.0;
 		for(w = 0; w < nw; w++)
-		{
-			enum starpu_worker_archtype arch;
+			target_res += res[s][w];
 
-#ifdef STARPU_USE_CUDA
-			int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
-			if(ncuda != 0)
-			{
-				if(w == 0) arch = STARPU_CUDA_WORKER;
-				if(w == 1) arch = STARPU_CPU_WORKER;
-			}
-			else
-				if(w == 0) arch = STARPU_CPU_WORKER;
-#else
-			if(w == 0) arch = STARPU_CPU_WORKER;
-#endif //STARPU_USE_CUDA
-			if(w == 1)
+		for(w = 0; w < nw; w++)
+		{
+			enum starpu_worker_archtype arch = sc_hypervisor_get_arch_for_index(w, tw);
+			
+			if(arch == STARPU_CPU_WORKER) 
 			{
 				int nworkers_to_add = res_rounded[s][w];
-				int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, current_nworkers, &nworkers_to_add, arch);
-				int i;
-				for(i = 0; i < nworkers_to_add; i++)
-					workers_add[nw_add++] = workers_to_add[i];
-				free(workers_to_add);
+				if(target_res == 0.0)
+				{
+					nworkers_to_add=1;
+					start[w]--;
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &nworkers_to_add, arch);
+					int i;
+					for(i = 0; i < nworkers_to_add; i++)
+						workers_add[nw_add++] = workers_to_add[i];
+					free(workers_to_add);
+				}
+				else
+				{
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &nworkers_to_add, arch);
+					int i;
+					for(i = 0; i < nworkers_to_add; i++)
+						workers_add[nw_add++] = workers_to_add[i];
+					free(workers_to_add);
+				}
 			}
-			
 			else
 			{
 				double nworkers_to_add = res[s][w];
@@ -420,7 +430,7 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int
 				double diff = nworkers_to_add - x_double;
 				if(diff == 0.0)
 				{
-					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, current_nworkers, &x, arch);
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &x, arch);
 					int i;
 					for(i = 0; i < x; i++)
 						workers_add[nw_add++] = workers_to_add[i];
@@ -429,7 +439,7 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int
 				else
 				{
 					x+=1;
-					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, current_nworkers, &x, arch);
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &x, arch);
 					int i;
 					if(diff >= 0.3)
 						for(i = 0; i < x; i++)
@@ -437,7 +447,7 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int
 					else
 						for(i = 0; i < x-1; i++)
 							workers_add[nw_add++] = workers_to_add[i];
-
+					
 					free(workers_to_add);
 				}
 			}
@@ -453,17 +463,19 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(int* sched_ctxs, int ns, int
 }
 
 /* nw = all the workers (either in a list or on all machine) */
-void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], int *sched_ctxs_input, int *workers_input, unsigned do_size)
+void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], unsigned *sched_ctxs_input, int *workers_input, unsigned do_size, struct types_of_workers *tw)
 {
 	int w, s;
-	double nworkers[ns][2];
-	int nworkers_rounded[ns][2];
+	int ntypes_of_workers = tw->nw; 
+	double nworkers[ns][ntypes_of_workers];
+	int nworkers_rounded[ns][ntypes_of_workers];
 	for(s = 0; s < ns; s++)
 	{
-		nworkers[s][0] = 0.0;
-		nworkers[s][1] = 0.0;
-		nworkers_rounded[s][0] = 0;
-		nworkers_rounded[s][1] = 0;
+		for(w = 0; w < ntypes_of_workers; w++)
+		{
+			nworkers[s][w] = 0.0;
+			nworkers_rounded[s][w] = 0;
+		}
 		
 	}
 	
@@ -472,27 +484,27 @@ void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][n
 		for(w = 0; w < nw; w++)
 		{
 			enum starpu_worker_archtype arch = starpu_worker_get_type(w);
-			
+			int idx = sc_hypervisor_get_index_for_arch(arch, tw);
+			nworkers[s][idx] += w_in_s[s][w];
+				
 			if(arch == STARPU_CUDA_WORKER)
 			{
-				nworkers[s][0] += w_in_s[s][w];
 				if(w_in_s[s][w] >= 0.3)
-					nworkers_rounded[s][0]++;
+					nworkers_rounded[s][idx]++;
 			}
 			else
 			{
-				nworkers[s][1] += w_in_s[s][w];
 				if(w_in_s[s][w] > 0.5)
-					nworkers_rounded[s][1]++;
+					nworkers_rounded[s][idx]++;
 			}
 		}
 	}
 	
 	if(!do_size)
-		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers, sched_ctxs_input);
+		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, ntypes_of_workers, nworkers_rounded, nworkers, sched_ctxs_input, tw);
 	else
 	{
-		int *current_sched_ctxs = sched_ctxs_input == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs_input;
+		unsigned *current_sched_ctxs = sched_ctxs_input == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs_input;
 
 		unsigned has_workers = 0;
 		for(s = 0; s < ns; s++)
@@ -506,9 +518,9 @@ void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][n
 			}
 		}
 		if(has_workers)
-			sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers, current_sched_ctxs);
+			sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, ntypes_of_workers, nworkers_rounded, nworkers, current_sched_ctxs, tw);
 		else
-			sc_hypervisor_lp_distribute_resources_in_ctxs(current_sched_ctxs, ns, 2, nworkers_rounded, nworkers, workers_input, nw);
+			sc_hypervisor_lp_distribute_resources_in_ctxs(current_sched_ctxs, ns, ntypes_of_workers, nworkers_rounded, nworkers, workers_input, nw, tw);
 	}
 	return;
 }

+ 86 - 103
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -47,7 +47,7 @@ unsigned sc_hypervisor_find_lowest_prio_sched_ctx(unsigned req_sched_ctx, int nw
 	int highest_priority = -1;
 	int current_priority = 0;
 	unsigned sched_ctx = STARPU_NMAX_SCHED_CTXS;
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 
@@ -57,7 +57,7 @@ unsigned sc_hypervisor_find_lowest_prio_sched_ctx(unsigned req_sched_ctx, int nw
 	{
 		if(sched_ctxs[i] != STARPU_NMAX_SCHED_CTXS && sched_ctxs[i] != req_sched_ctx)
 		{
-			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctxs[i]);
+			int nworkers = (int)starpu_sched_ctx_get_nworkers(sched_ctxs[i]);
 			config  = sc_hypervisor_get_config(sched_ctxs[i]);
 			if((nworkers + nworkers_to_move) <= config->max_nworkers)
 			{
@@ -177,11 +177,11 @@ int* sc_hypervisor_get_idlest_workers(unsigned sched_ctx, int *nworkers, enum st
 }
 
 /* get the number of workers in the context that are allowed to be moved (that are not fixed) */
-unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_worker_archtype arch)
+int sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *config, unsigned sched_ctx, enum starpu_worker_archtype arch)
 {
 	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
 
-	unsigned potential_workers = 0;
+	int potential_workers = 0;
 	int worker;
 
 	struct starpu_sched_ctx_iterator it;
@@ -207,10 +207,10 @@ unsigned sc_hypervisor_get_movable_nworkers(struct sc_hypervisor_policy_config *
 int sc_hypervisor_compute_nworkers_to_move(unsigned req_sched_ctx)
 {
        	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(req_sched_ctx);
-	unsigned nworkers = starpu_sched_ctx_get_nworkers(req_sched_ctx);
-	unsigned nworkers_to_move = 0;
+	int nworkers = (int)starpu_sched_ctx_get_nworkers(req_sched_ctx);
+	int nworkers_to_move = 0;
 
-	unsigned potential_moving_workers = sc_hypervisor_get_movable_nworkers(config, req_sched_ctx, STARPU_ANY_WORKER);
+	int potential_moving_workers = (int)sc_hypervisor_get_movable_nworkers(config, req_sched_ctx, STARPU_ANY_WORKER);
 	if(potential_moving_workers > 0)
 	{
 		if(potential_moving_workers <= config->min_nworkers)
@@ -263,14 +263,14 @@ unsigned sc_hypervisor_policy_resize(unsigned sender_sched_ctx, unsigned receive
 			unsigned poor_sched_ctx = STARPU_NMAX_SCHED_CTXS;
 			if(receiver_sched_ctx == STARPU_NMAX_SCHED_CTXS)
 			{
-				poor_sched_ctx = sc_hypervisor_find_lowest_prio_sched_ctx(sender_sched_ctx, nworkers_to_move);
+				poor_sched_ctx = sc_hypervisor_find_lowest_prio_sched_ctx(sender_sched_ctx, (unsigned)nworkers_to_move);
 			}
 			else
 			{
 				poor_sched_ctx = receiver_sched_ctx;
 				struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(poor_sched_ctx);
-				unsigned nworkers = starpu_sched_ctx_get_nworkers(poor_sched_ctx);
-				unsigned nshared_workers = starpu_sched_ctx_get_nshared_workers(sender_sched_ctx, poor_sched_ctx);
+				int nworkers = (int)starpu_sched_ctx_get_nworkers(poor_sched_ctx);
+				int nshared_workers = (int)starpu_sched_ctx_get_nshared_workers(sender_sched_ctx, poor_sched_ctx);
 				if((nworkers+nworkers_to_move-nshared_workers) > config->max_nworkers)
 					nworkers_to_move = nworkers > config->max_nworkers ? 0 : (config->max_nworkers - nworkers+nshared_workers);
 				if(nworkers_to_move == 0) poor_sched_ctx = STARPU_NMAX_SCHED_CTXS;
@@ -301,56 +301,9 @@ unsigned sc_hypervisor_policy_resize_to_unknown_receiver(unsigned sender_sched_c
 	return sc_hypervisor_policy_resize(sender_sched_ctx, STARPU_NMAX_SCHED_CTXS, 0, now);
 }
 
-static double _get_ispeed_sample_for_type_of_worker(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype req_arch)
-{
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
-        int worker;
-
-	double avg = 0.0;
-	int n = 0;
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-                workers->init_iterator(workers, &it);
-
-        while(workers->has_next(workers, &it))
-	{
-                worker = workers->get_next(workers, &it);
-                enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
-                if(arch == req_arch)
-                {
-			struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
-			avg += config->ispeed_w_sample[worker];
-			n++;
-		}
-        }
-
-	return n != 0 ? avg/n : 0;
-}
-
-static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
-{
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
-	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctx);
-        
-	int worker;
-	double ispeed_sample = 0.0;
-	struct starpu_sched_ctx_iterator it;
-
-	if(workers->init_iterator)
-                workers->init_iterator(workers, &it);
-
-        while(workers->has_next(workers, &it))
-	{
-                worker = workers->get_next(workers, &it);
-	        ispeed_sample += config->ispeed_w_sample[worker];
-        }
-
-	return ispeed_sample;
-}
-
 double sc_hypervisor_get_slowest_ctx_exec_time(void)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 /* 	double curr_time = starpu_timing_now(); */
@@ -374,7 +327,7 @@ double sc_hypervisor_get_slowest_ctx_exec_time(void)
 
 double sc_hypervisor_get_fastest_ctx_exec_time(void)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
 	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
 
 	double curr_time = starpu_timing_now();
@@ -397,26 +350,58 @@ double sc_hypervisor_get_fastest_ctx_exec_time(void)
 	return fastest_time;
 }
 
-void sc_hypervisor_group_workers_by_type(int *workers, int nworkers, int ntypes_of_workers, int total_nw[ntypes_of_workers])
+void sc_hypervisor_group_workers_by_type(struct types_of_workers *tw, int *total_nw)
 {
-	int current_nworkers = workers == NULL ? starpu_worker_get_count() : nworkers;
 	int w;
-	for(w = 0; w < ntypes_of_workers; w++)
+	for(w = 0; w < tw->nw; w++)
 		total_nw[w] = 0;
 
-	for(w = 0; w < current_nworkers; w++)
+	if(tw->ncpus != 0)
+	{
+		total_nw[0] = tw->ncpus;
+		if(tw->ncuda != 0)
+			total_nw[1] = tw->ncuda;
+	}
+	else
+	{
+		if(tw->ncuda != 0)
+			total_nw[0] =tw->ncuda;
+	}
+
+}
+
+enum starpu_worker_archtype sc_hypervisor_get_arch_for_index(unsigned w, struct types_of_workers *tw)
+{
+	if(w == 0)
 	{
- 		enum starpu_worker_archtype arch = workers == NULL ? starpu_worker_get_type(w) :
-			starpu_worker_get_type(workers[w]);
-		if(ntypes_of_workers == 2)
+		if(tw->ncpus != 0)
+			return STARPU_CPU_WORKER;
+		else
+			return STARPU_CUDA_WORKER;
+	}
+	else
+		if(tw->ncuda != 0)
+			return STARPU_CUDA_WORKER;
+}
+
+
+unsigned sc_hypervisor_get_index_for_arch(enum starpu_worker_archtype arch, struct types_of_workers *tw)
+{
+	
+	if(arch == STARPU_CPU_WORKER)
+	{
+		if(tw->ncpus != 0)
+			return 0;
+	}
+	else
+	{
+		if(arch == STARPU_CUDA_WORKER)
 		{
-			if(arch == STARPU_CPU_WORKER)
-				total_nw[1]++;
+			if(tw->ncpus != 0)
+				return 1;
 			else
-				total_nw[0]++;
+				return 0;
 		}
-		else
-			total_nw[0]++;
 	}
 }
 
@@ -445,9 +430,9 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 				{
 					if(arch == STARPU_CUDA_WORKER)
 					{
-						double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker);
+						double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker));
 						transfer_time +=  (tp->data_size / transfer_speed) / 1000. ;
-						double latency = starpu_get_latency_RAM_CUDA(worker);
+						double latency = starpu_transfer_latency(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker));
 						transfer_time += latency/1000.;
 						
 						
@@ -456,9 +441,9 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 					{
 						if(!starpu_sched_ctx_contains_type_of_worker(arch, tp->sched_ctx_id))
 						{
-							double transfer_speed = starpu_get_bandwidth_CUDA_RAM(worker);
+							double transfer_speed = starpu_transfer_bandwidth(starpu_worker_get_memory_node(worker), STARPU_MAIN_RAM);
 							transfer_time += (tp->data_size / transfer_speed) / 1000. ;
-							double latency = starpu_get_latency_CUDA_RAM(worker);
+							double latency = starpu_transfer_latency(starpu_worker_get_memory_node(worker), STARPU_MAIN_RAM);
 							transfer_time += latency / 1000.;
 						}
 					}
@@ -491,16 +476,18 @@ unsigned sc_hypervisor_check_idle(unsigned sched_ctx, int worker)
 /* check if there is a big speed gap between the contexts */
 unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void)
 {
-	int *sched_ctxs = sc_hypervisor_get_sched_ctxs();
-	int nsched_ctxs = sc_hypervisor_get_nsched_ctxs();
+	unsigned *sched_ctxs = sc_hypervisor_get_sched_ctxs();
+	int ns = sc_hypervisor_get_nsched_ctxs();
+	int *workers = NULL;
+	int nworkers = starpu_worker_get_count();
 	int i = 0, j = 0;
 	struct sc_hypervisor_wrapper* sc_w;
 	struct sc_hypervisor_wrapper* other_sc_w;
 
 	
-	double optimal_v[nsched_ctxs];
+	double optimal_v[ns];
 	unsigned has_opt_v = 1;
-	for(i = 0; i < nsched_ctxs; i++)
+	for(i = 0; i < ns; i++)
 	{
 		optimal_v[i] = _get_optimal_v(i);
 		if(optimal_v[i] == 0.0)
@@ -513,33 +500,29 @@ unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void)
 /*if an optimal speed has not been computed yet do it now */
 	if(!has_opt_v)
 	{
-		int nw = 1;
-#ifdef STARPU_USE_CUDA
-		int ncuda = starpu_worker_get_count_by_type(STARPU_CUDA_WORKER);
-		nw = ncuda != 0 ? 2 : 1;
-#endif	
-		double nworkers_per_type[nsched_ctxs][nw];
+		struct types_of_workers *tw = sc_hypervisor_get_types_of_workers(workers, nworkers);
+		int nw = tw->nw;
+		double nworkers_per_ctx[ns][nw];
 		int total_nw[nw];
-		for(i = 0; i < nw; i++)
-		{
-			for(j = 0; j < nsched_ctxs; j++)
-				nworkers_per_type[j][i] = 0.0;
-			total_nw[i] = 0;
-		}
-		sc_hypervisor_group_workers_by_type(NULL, -1, nw, total_nw);
-		
-		double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(nsched_ctxs, nw, nworkers_per_type, total_nw);
+		sc_hypervisor_group_workers_by_type(tw, total_nw);
+
+		double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(ns, nw, nworkers_per_ctx, total_nw, tw);
+
 		
 		if(vmax != 0.0)
 		{
-			for(i = 0; i < nsched_ctxs; i++)
+			for(i = 0; i < ns; i++)
 			{
 				sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
 				double v[nw];
-				v[0] = sc_hypervisor_get_speed(sc_w, STARPU_CUDA_WORKER);
-				v[1] = sc_hypervisor_get_speed(sc_w, STARPU_CPU_WORKER);
-				
-				optimal_v[i] = nworkers_per_type[i][0] * v[0] + nworkers_per_type[i][1]* v[1];
+				optimal_v[i] = 0.0;
+				int w;
+				for(w = 0; w < nw; w++)
+				{
+					v[w] = sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw));
+					
+					optimal_v[i] += nworkers_per_ctx[i][w];
+				}
 				_set_optimal_v(i, optimal_v[i]);
 			}
 			has_opt_v = 1;
@@ -550,7 +533,7 @@ unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void)
    theoretical one */
 	if(has_opt_v)
 	{
-		for(i = 0; i < nsched_ctxs; i++)
+		for(i = 0; i < ns; i++)
 		{
 			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
 			
@@ -559,7 +542,7 @@ unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void)
 				return 0;
 		}
 
-		for(i = 0; i < nsched_ctxs; i++)
+		for(i = 0; i < ns; i++)
 		{
 			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
 			
@@ -572,13 +555,13 @@ unsigned sc_hypervisor_check_speed_gap_btw_ctxs(void)
 		SC_MAX_SPEED_GAP and compare the speed of the contexts, whenever the difference
 		btw them is greater than the max value the function returns true */
 	{
-		for(i = 0; i < nsched_ctxs; i++)
+		for(i = 0; i < ns; i++)
 		{
 			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
 			double ctx_v = sc_hypervisor_get_ctx_speed(sc_w);
 			if(ctx_v != -1.0)
 			{
-				for(j = 0; j < nsched_ctxs; j++)
+				for(j = 0; j < ns; j++)
 				{
 					if(sched_ctxs[i] != sched_ctxs[j])
 					{

+ 49 - 28
sc_hypervisor/src/policies_utils/speed.c

@@ -74,10 +74,10 @@ double sc_hypervisor_get_speed_per_worker(struct sc_hypervisor_wrapper *sc_w, un
 /* /\* 			if(!worker_in_ctx) *\/ */
 /* /\* 			{ *\/ */
 
-/* /\* 				double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker); *\/ */
+/* /\* 				double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)); *\/ */
 /* /\* 				elapsed_time +=  (elapsed_data_used / transfer_speed) / 1000000 ; *\/ */
 /* /\* 			} *\/ */
-/* 			double latency = starpu_get_latency_RAM_CUDA(worker); */
+/* 			double latency = starpu_transfer_latency(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)); */
 /* //			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks); */
 /* 			elapsed_time += (elapsed_tasks * latency)/1000000; */
 /* //			printf("elapsed time after %lf \n", elapsed_time); */
@@ -96,40 +96,61 @@ double sc_hypervisor_get_speed_per_worker(struct sc_hypervisor_wrapper *sc_w, un
 /* compute an average value of the cpu/cuda speed */
 double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch)
 {
-	struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
-        int worker;
-
-	struct starpu_sched_ctx_iterator it;
-	if(workers->init_iterator)
-                workers->init_iterator(workers, &it);
+	struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sc_w->sched_ctx);
 
-	double speed = 0.0;
-	unsigned nworkers = 0;
-        while(workers->has_next(workers, &it))
+	double ctx_elapsed_flops = sc_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
+	double ctx_sample = config->ispeed_ctx_sample;
+	if(ctx_elapsed_flops > ctx_sample)
 	{
-                worker = workers->get_next(workers, &it);
-                enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
-                if(arch == req_arch)
-                {
-			double _vel = sc_hypervisor_get_speed_per_worker(sc_w, worker);
-			if(_vel > 0.0)
+		struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
+		int worker;
+		
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+		
+		double speed = 0.0;
+		unsigned nworkers = 0;
+		double all_workers_flops = 0.0;
+		double all_workers_idle_time = 0.0;
+		while(workers->has_next(workers, &it))
+		{
+			worker = workers->get_next(workers, &it);
+			enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
+			if(arch == req_arch)
 			{
-				speed += _vel;
+				all_workers_flops += sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
+				all_workers_idle_time += sc_w->idle_time[worker]; /* in seconds */
 				nworkers++;
-
 			}
+		}			
+		
+		if(nworkers != 0)
+		{
+			double curr_time = starpu_timing_now();
+			
+			/* compute speed for the last frame */
+			double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
+			elapsed_time -= all_workers_idle_time;
+			speed = (all_workers_flops / elapsed_time) / nworkers;
 		}
-	}			
-
-	speed = ((nworkers != 0 && speed > 0.1) ? speed / nworkers : -1.0);
-	if(speed != -1.0)
-	{
-		if(arch == STARPU_CUDA_WORKER)
-			sc_w->ref_speed[0] = sc_w->ref_speed[0] > 1.0 ? (sc_w->ref_speed[0] + speed) / 2 : speed; 
 		else
-			sc_w->ref_speed[1] = sc_w->ref_speed[1] > 1.0 ? (sc_w->ref_speed[1] + speed) / 2 : speed; 
+			speed = -1.0;
+		
+		if(speed != -1.0)
+		{
+			/* if ref_speed started being corrupted bc of the old bad distribution
+			   register only the last frame otherwise make the average with the speed 
+			   behavior of the application until now */
+			if(arch == STARPU_CUDA_WORKER)
+				sc_w->ref_speed[0] = (sc_w->ref_speed[0] > 0.1) ? ((sc_w->ref_speed[0] + speed ) / 2.0) : speed; 
+			else
+				sc_w->ref_speed[1] = (sc_w->ref_speed[1] > 0.1) ? ((sc_w->ref_speed[1] + speed ) / 2.0) : speed; 
+		}
+		return speed;
 	}
-	return speed;
+
+	return -1.0;
 }
 
 /* compute an average value of the cpu/cuda old speed */

+ 40 - 27
sc_hypervisor/src/sc_hypervisor.c

@@ -291,7 +291,7 @@ void sc_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
-static int _get_first_free_sched_ctx(int *sched_ctxs, int nsched_ctxs)
+static int _get_first_free_sched_ctx(unsigned *sched_ctxs, int nsched_ctxs)
 {
 	int i;
 	for(i = 0; i < nsched_ctxs; i++)
@@ -305,7 +305,7 @@ static int _get_first_free_sched_ctx(int *sched_ctxs, int nsched_ctxs)
    and have instead {5, 7, MAXVAL, MAXVAL, MAXVAL}
    it is easier afterwards to iterate the array
 */
-static void _rearange_sched_ctxs(int *sched_ctxs, int old_nsched_ctxs)
+static void _rearange_sched_ctxs(unsigned *sched_ctxs, int old_nsched_ctxs)
 {
 	int first_free_id = STARPU_NMAX_SCHED_CTXS;
 	int i;
@@ -332,7 +332,7 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 	unsigned i;
 	for(i = 0; i < hypervisor.nsched_ctxs; i++)
 	{
-		if(hypervisor.sched_ctxs[i] == (int)sched_ctx)
+		if(hypervisor.sched_ctxs[i] == sched_ctx)
 		{
 			hypervisor.sched_ctxs[i] = STARPU_NMAX_SCHED_CTXS;
 			break;
@@ -376,21 +376,6 @@ static int get_ntasks( int *tasks)
 	return ntasks;
 }
 
-
-static void _get_cpus(int *workers, int nworkers, int *cpus, int *ncpus)
-{
-	int i, worker;
-	*ncpus = 0;
-
-	for(i = 0; i < nworkers; i++)
-	{
-		worker = workers[i];
-		enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
-		if(arch == STARPU_CPU_WORKER)
-			cpus[(*ncpus)++] = worker;
-	}
-}
-
 int sc_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_worker_archtype arch)
 {
 	int nworkers_ctx = 0;
@@ -517,7 +502,7 @@ void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sch
 				}
 
 				hypervisor.resize[sender_sched_ctx] = 0;
-
+				if(imposed_resize)  imposed_resize = 0;
 				starpu_pthread_mutex_unlock(&hypervisor.sched_ctx_w[sender_sched_ctx].mutex);
 			}
 		}
@@ -604,6 +589,7 @@ void sc_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigne
 				}
 
 				hypervisor.resize[sched_ctx] = 0;
+				if(imposed_resize)  imposed_resize = 0;
 				starpu_pthread_mutex_unlock(&hypervisor.sched_ctx_w[sched_ctx].mutex);
 			}
 		}
@@ -627,7 +613,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 			struct sc_hypervisor_wrapper *sc_w = &hypervisor.sched_ctx_w[hypervisor.sched_ctxs[i]];
 			starpu_pthread_mutex_lock(&sc_w->mutex);
 			unsigned only_remove = 0;
-			if(sc_w->resize_ack.receiver_sched_ctx == -1 && hypervisor.sched_ctxs[i] != (int)sched_ctx &&
+			if(sc_w->resize_ack.receiver_sched_ctx == -1 && hypervisor.sched_ctxs[i] != sched_ctx &&
 			   sc_w->resize_ack.nmoved_workers > 0 && starpu_sched_ctx_contains_worker(worker, hypervisor.sched_ctxs[i]))
 			{
 				int j;
@@ -734,7 +720,7 @@ void sc_hypervisor_post_resize_request(unsigned sched_ctx, int task_tag)
 	starpu_pthread_mutex_unlock(&hypervisor.resize_mut[sched_ctx]);
 }
 
-void sc_hypervisor_resize_ctxs(int *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
+void sc_hypervisor_resize_ctxs(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
 	if(hypervisor.policy.resize_ctxs)
 		hypervisor.policy.resize_ctxs(sched_ctxs, nsched_ctxs, workers, nworkers);
@@ -898,11 +884,11 @@ static void notify_delete_context(unsigned sched_ctx)
 	sc_hypervisor_unregister_ctx(sched_ctx);
 }
 
-void sc_hypervisor_size_ctxs(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+void sc_hypervisor_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	unsigned curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : nsched_ctxs;
-	int *curr_sched_ctxs = sched_ctxs == NULL ? hypervisor.sched_ctxs : sched_ctxs;
+	int curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : nsched_ctxs;
+	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? hypervisor.sched_ctxs : sched_ctxs;
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	unsigned s;
 	for(s = 0; s < curr_nsched_ctxs; s++)
@@ -917,7 +903,7 @@ struct sc_hypervisor_wrapper* sc_hypervisor_get_wrapper(unsigned sched_ctx)
 	return &hypervisor.sched_ctx_w[sched_ctx];
 }
 
-int* sc_hypervisor_get_sched_ctxs()
+unsigned* sc_hypervisor_get_sched_ctxs()
 {
 	return hypervisor.sched_ctxs;
 }
@@ -929,7 +915,7 @@ int sc_hypervisor_get_nsched_ctxs()
 	return ns;
 }
 
-void sc_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
+void sc_hypervisor_save_size_req(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	hypervisor.sr = (struct size_request*)malloc(sizeof(struct size_request));
 	hypervisor.sr->sched_ctxs = sched_ctxs;
@@ -938,7 +924,7 @@ void sc_hypervisor_save_size_req(int *sched_ctxs, int nsched_ctxs, int *workers,
 	hypervisor.sr->nworkers = nworkers;
 }
 
-unsigned sc_hypervisor_get_size_req(int **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers)
+unsigned sc_hypervisor_get_size_req(unsigned **sched_ctxs, int* nsched_ctxs, int **workers, int *nworkers)
 {
 	if(hypervisor.sr != NULL)
 	{
@@ -969,3 +955,30 @@ void _set_optimal_v(unsigned sched_ctx, double optimal_v)
 {
 	hypervisor.optimal_v[sched_ctx] = optimal_v;
 }
+
+static struct types_of_workers* _init_structure_types_of_workers(void)
+{
+	struct types_of_workers *tw = (struct types_of_workers*)malloc(sizeof(struct types_of_workers));
+        tw->ncpus = 0;
+	tw->ncuda = 0;
+        tw->nw = 0;
+        return tw;
+}
+
+struct types_of_workers* sc_hypervisor_get_types_of_workers(int *workers, unsigned nworkers)
+{
+	struct types_of_workers *tw = _init_structure_types_of_workers();
+
+        unsigned w;
+	for(w = 0; w < nworkers; w++)
+        {
+                enum starpu_worker_archtype arch = workers == NULL ? starpu_worker_get_type((int)w) : starpu_worker_get_type(workers[w]);
+                if(arch == STARPU_CPU_WORKER)
+			tw->ncpus++;
+                if(arch == STARPU_CUDA_WORKER)
+			tw->ncuda++;
+        }
+        if(tw->ncpus > 0) tw->nw++;
+        if(tw->ncuda > 0) tw->nw++;
+	return tw;
+}

+ 2 - 2
sc_hypervisor/src/sc_hypervisor_intern.h

@@ -23,7 +23,7 @@ struct size_request
 {
 	int *workers;
 	int nworkers;
-	int *sched_ctxs;
+	unsigned *sched_ctxs;
 	int nsched_ctxs;
 };
 
@@ -58,7 +58,7 @@ struct configuration_entry
 struct sc_hypervisor
 {
 	struct sc_hypervisor_wrapper sched_ctx_w[STARPU_NMAX_SCHED_CTXS];
-	int sched_ctxs[STARPU_NMAX_SCHED_CTXS];
+	unsigned sched_ctxs[STARPU_NMAX_SCHED_CTXS];
 	unsigned nsched_ctxs;
 	unsigned resize[STARPU_NMAX_SCHED_CTXS];
 	unsigned allow_remove[STARPU_NMAX_SCHED_CTXS];

+ 14 - 0
src/core/jobs.c

@@ -294,6 +294,20 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	_starpu_decrement_nready_tasks();
 
 	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
+
+	struct _starpu_worker *worker;
+	worker = _starpu_get_local_worker_key();
+	if (worker)
+	{
+		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+
+		if(worker->removed_from_ctx[sched_ctx] == 1 && worker->shares_tasks_lists[sched_ctx] == 1)
+		{
+			_starpu_worker_gets_out_of_ctx(sched_ctx, worker);
+			worker->removed_from_ctx[sched_ctx] = 0;
+		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+	}
 }
 
 /* This function is called when a new task is submitted to StarPU

+ 1 - 1
src/core/perfmodel/perfmodel.c

@@ -274,7 +274,7 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 		return 0.0;
 
 	unsigned src_node = _starpu_select_src_node(handle, memory_node);
-	return _starpu_predict_transfer_time(src_node, memory_node, size);
+	return starpu_transfer_predict(src_node, memory_node, size);
 }
 
 /* Data transfer performance modeling */

+ 1 - 5
src/core/perfmodel/perfmodel.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -58,10 +58,6 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 void _starpu_create_sampling_directory_if_needed(void);
 
 void _starpu_load_bus_performance_files(void);
-double _starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
-double _starpu_transfer_latency(unsigned src_node, unsigned dst_node);
-double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size);
-
 
 void _starpu_set_calibrate_flag(unsigned val);
 unsigned _starpu_get_calibrate_flag(void);

+ 3 - 23
src/core/perfmodel/perfmodel_bus.c

@@ -1379,26 +1379,6 @@ static void write_bus_bandwidth_file_content(void)
 }
 #endif /* STARPU_SIMGRID */
 
-double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
-{
-	return bandwidth_matrix[STARPU_MAIN_RAM][cudadev+1];
-}
-
-double starpu_get_latency_RAM_CUDA(unsigned cudadev)
-{
-	return latency_matrix[STARPU_MAIN_RAM][cudadev+1];
-}
-
-double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
-{
-	return bandwidth_matrix[1][STARPU_MAIN_RAM];
-}
-
-double starpu_get_latency_CUDA_RAM(unsigned cudadev)
-{
-	return latency_matrix[1][STARPU_MAIN_RAM];
-}
-
 void starpu_bus_print_bandwidth(FILE *f)
 {
 	unsigned src, dst, maxnode;
@@ -1877,19 +1857,19 @@ void _starpu_load_bus_performance_files(void)
 }
 
 /* (in MB/s) */
-double _starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
+double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
 {
 	return bandwidth_matrix[src_node][dst_node];
 }
 
 /* (in µs) */
-double _starpu_transfer_latency(unsigned src_node, unsigned dst_node)
+double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
 {
 	return latency_matrix[src_node][dst_node];
 }
 
 /* (in µs) */
-double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
 {
 	double bandwidth = bandwidth_matrix[src_node][dst_node];
 	double latency = latency_matrix[src_node][dst_node];

+ 15 - 0
src/core/sched_ctx.c

@@ -128,6 +128,21 @@ void starpu_sched_ctx_stop_task_submission()
 	_starpu_task_submit_internally(&stop_submission_task);
 }
 
+void starpu_sched_ctx_worker_shares_tasks_lists(int workerid, int sched_ctx_id)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+	/* if it is the initial sched_ctx there is no point in taking the mutex:
+	   the workers are not launched yet */
+	if(!sched_ctx->is_initial_sched)
+		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
+
+	worker->shares_tasks_lists[sched_ctx_id] = 1;
+
+	if(!sched_ctx->is_initial_sched)
+		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
+}
+
 static void _starpu_add_workers_to_sched_ctx(struct _starpu_sched_ctx *sched_ctx, int *workerids, int nworkers,
 				       int *added_workers, int *n_added_workers)
 {

+ 5 - 5
src/core/workers.c

@@ -468,7 +468,11 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 		int ctx;
 		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
+		{
 			workerarg->removed_from_ctx[ctx] = 0;
+			workerarg->shares_tasks_lists[ctx] = 0;
+		}
+
 
 		STARPU_PTHREAD_MUTEX_INIT(&workerarg->sched_mutex, NULL);
 		STARPU_PTHREAD_COND_INIT(&workerarg->sched_cond, NULL);
@@ -1038,11 +1042,6 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	return 0;
 }
 
-void starpu_profiling_init()
-{
-	_starpu_profiling_init();
-}
-
 /*
  * Handle runtime termination
  */
@@ -1243,6 +1242,7 @@ void starpu_shutdown(void)
 	_STARPU_DEBUG("Shutdown finished\n");
 }
 
+
 unsigned starpu_worker_get_count(void)
 {
 	return config.topology.nworkers;

+ 3 - 0
src/core/workers.h

@@ -105,6 +105,9 @@ struct _starpu_worker
 	   parallel sections to be executed on their allocated resources */
 	unsigned parallel_sect;
 
+	/* indicates whether the worker shares tasks lists with other workers */
+	/* in this case, when removing it from a context, it disappears instantly */
+	unsigned shares_tasks_lists[STARPU_NMAX_SCHED_CTXS];
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */

+ 1 - 1
src/datawizard/coherency.c

@@ -62,7 +62,7 @@ unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destinati
 		{
 			if (src_node_mask & (1<<i))
 			{
-				double time = _starpu_predict_transfer_time(i, destination, size);
+				double time = starpu_transfer_predict(i, destination, size);
 				unsigned handling_node;
 
 				/* Avoid indirect transfers */

+ 1 - 1
src/datawizard/memalloc.c

@@ -1043,7 +1043,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 			{
 				/* only time can change between disk <-> main_ram 
 				 * and not between main_ram <-> worker if we compare diks*/
-				double time_tmp = _starpu_predict_transfer_time(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
+				double time_tmp = starpu_transfer_predict(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
 				if (target == -1 || time_disk > time_tmp)
 				{
 					target = i;

+ 1 - 1
src/profiling/bound.c

@@ -585,7 +585,7 @@ void starpu_bound_print_lp(FILE *output)
 							/* The data transfer from w to w2 only happens if tasks run there */
 							fprintf(output, "d_t%luw%ut%luw%u >= %f - 2e5 + 1e5 t%luw%u + 1e5 t%luw%u;\n",
 									t1->deps[i].dep->id, w, t1->id, w2,
-									_starpu_predict_transfer_time(n, n2, t1->deps[i].size)/1000.,
+									starpu_transfer_predict(n, n2, t1->deps[i].size)/1000.,
 									t1->deps[i].dep->id, w, t1->id, w2);
 						}
 					}

+ 29 - 15
src/profiling/profiling.c

@@ -66,6 +66,29 @@ int _starpu_profiling =
 #endif
 	;
 
+void starpu_profiling_init()
+{
+	_starpu_profiling_init();
+}
+
+void _starpu_profiling_reset_counters()
+{
+	int worker;
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+	{
+		_starpu_worker_reset_profiling_info(worker);
+	}
+
+	int busid;
+	int bus_cnt = starpu_bus_get_count();
+	for (busid = 0; busid < bus_cnt; busid++)
+	{
+		struct starpu_profiling_bus_info *bus_info;
+		bus_info = busid_to_node_pair[busid].bus_info;
+		_starpu_bus_reset_profiling_info(bus_info);
+	}
+}
+
 int starpu_profiling_status_set(int status)
 {
 	ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);
@@ -78,19 +101,7 @@ int starpu_profiling_status_set(int status)
 	/* If we enable profiling, we reset the counters. */
 	if (status == STARPU_PROFILING_ENABLE)
 	{
-		int worker;
-		for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
-			_starpu_worker_reset_profiling_info(worker);
-
-		int busid;
-		int bus_cnt = starpu_bus_get_count();
-		for (busid = 0; busid < bus_cnt; busid++)
-		{
-			struct starpu_profiling_bus_info *bus_info;
-			bus_info = busid_to_node_pair[busid].bus_info;
-
-			_starpu_bus_reset_profiling_info(bus_info);
-		}
+		_starpu_profiling_reset_counters();
 	}
 
 	return prev_value;
@@ -98,13 +109,16 @@ int starpu_profiling_status_set(int status)
 
 void _starpu_profiling_init(void)
 {
-	int worker;
 	const char *env;
+	int worker;
+
 	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
 	{
 		STARPU_PTHREAD_MUTEX_INIT(&worker_info_mutex[worker], NULL);
-		_starpu_worker_reset_profiling_info(worker);
 	}
+
+	_starpu_profiling_reset_counters();
+
 	if ((env = getenv("STARPU_PROFILING")) && atoi(env))
 	{
 		ANNOTATE_HAPPENS_AFTER(&_starpu_profiling);

+ 14 - 2
src/sched_policies/eager_central_policy.c

@@ -130,17 +130,29 @@ static struct starpu_task *pop_task_eager_policy(unsigned sched_ctx_id)
 	VALGRIND_HG_MUTEX_UNLOCK_POST(&data->policy_mutex);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
-	 task = _starpu_fifo_pop_task(data->fifo, workerid);
+	task = _starpu_fifo_pop_task(data->fifo, workerid);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 
 	return task;
 }
 
+static void eager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+
+	int workerid;
+	unsigned i;
+	for (i = 0; i < nworkers; i++)
+	{
+		workerid = workerids[i];
+		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
+	}
+}
+
 struct starpu_sched_policy _starpu_sched_eager_policy =
 {
 	.init_sched = initialize_eager_center_policy,
 	.deinit_sched = deinitialize_eager_center_policy,
-	.add_workers = NULL,
+	.add_workers = eager_add_workers,
 	.remove_workers = NULL,
 	.push_task = push_task_eager_policy,
 	.pop_task = pop_task_eager_policy,

+ 13 - 0
src/sched_policies/eager_central_priority_policy.c

@@ -239,8 +239,21 @@ static struct starpu_task *_starpu_priority_pop_task(unsigned sched_ctx_id)
 	return chosen_task;
 }
 
+static void eager_center_priority_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
+{
+
+        int workerid;
+	unsigned i;
+        for (i = 0; i < nworkers; i++)
+        {
+		workerid = workerids[i];
+                starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
+        }
+}
+
 struct starpu_sched_policy _starpu_sched_prio_policy =
 {
+	.add_workers = eager_center_priority_add_workers,
 	.init_sched = initialize_eager_center_priority_policy,
 	.deinit_sched = deinitialize_eager_center_priority_policy,
 	/* we always use priorities in that policy */

+ 3 - 3
src/sched_policies/parallel_eager.c

@@ -58,7 +58,7 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 	for(i = 0; i < nworkers; i++)
 	{
 		workerid = workerids[i];
-
+		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 		int cnt = possible_combinations_cnt[workerid]++;
 		possible_combinations[workerid][cnt] = workerid;
 		possible_combinations_size[workerid][cnt] = 1;
@@ -177,8 +177,8 @@ static int push_task_peager_policy(struct starpu_task *task)
 		worker = workers->get_next(workers, &it);
 		int master = data->master_id[worker];
 		/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
-		if (!starpu_worker_is_combined_worker(worker) &&
-				starpu_worker_get_type(worker) != STARPU_CPU_WORKER  || master == worker)
+		if ((!starpu_worker_is_combined_worker(worker) && starpu_worker_get_type(worker) != STARPU_CPU_WORKER)
+		    || (master == worker))
 		{
 			starpu_pthread_mutex_t *sched_mutex;
 			starpu_pthread_cond_t *sched_cond;

+ 1 - 0
src/sched_policies/work_stealing_policy.c

@@ -400,6 +400,7 @@ static void ws_add_workers(unsigned sched_ctx_id, int *workerids,unsigned nworke
 	for (i = 0; i < nworkers; i++)
 	{
 		workerid = workerids[i];
+		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 		ws->queue_array[workerid] = _starpu_create_deque();
 		/**
 		 * The first WS_POP_TASK will increase NPROCESSED though no task was actually performed yet,