
Merge from trunk @11615:11640

Marc Sergent, 12 years ago
Parent
Commit 190b9bb01b
61 changed files with 651 additions and 222 deletions
  1. configure.ac (+4 -1)
  2. doc/doxygen/Makefile.am (+23 -22)
  3. doc/doxygen/chapters/00introduction.doxy (+228 -0)
  4. doc/doxygen/chapters/01building.doxy (+0 -0)
  5. doc/doxygen/chapters/basic_examples.doxy (+8 -12)
  6. doc/doxygen/chapters/advanced_examples.doxy (+3 -3)
  7. doc/doxygen/chapters/optimize_performance.doxy (+1 -1)
  8. doc/doxygen/chapters/performance_feedback.doxy (+4 -9)
  9. doc/doxygen/chapters/06tips_and_tricks.doxy (+0 -0)
  10. doc/doxygen/chapters/out_of_core.doxy (+2 -2)
  11. doc/doxygen/chapters/08mpi_support.doxy (+0 -0)
  12. doc/doxygen/chapters/09fft_support.doxy (+0 -0)
  13. doc/doxygen/chapters/10mic_scc_support.doxy (+0 -0)
  14. doc/doxygen/chapters/c_extensions.doxy (+4 -6)
  15. doc/doxygen/chapters/12socl_opencl_extensions.doxy (+0 -0)
  16. doc/doxygen/chapters/13scheduling_contexts.doxy (+0 -0)
  17. doc/doxygen/chapters/14scheduling_context_hypervisor.doxy (+0 -0)
  18. doc/doxygen/chapters/environment_variables.doxy (+3 -1)
  19. doc/doxygen/chapters/configure_options.doxy (+3 -3)
  20. doc/doxygen/chapters/17files.doxy (+52 -0)
  21. doc/doxygen/chapters/scaling-vector-example.doxy (+5 -5)
  22. doc/doxygen/chapters/19fdl-1.3.doxy (+0 -0)
  23. doc/doxygen/chapters/code/cholesky_pragma.c (+2 -2)
  24. doc/doxygen/chapters/code/complex.c (+2 -2)
  25. doc/doxygen/chapters/code/disk_compute.c (+2 -2)
  26. doc/doxygen/chapters/code/disk_copy.c (+2 -2)
  27. doc/doxygen/chapters/code/forkmode.c (+2 -2)
  28. doc/doxygen/chapters/code/hello_pragma.c (+2 -2)
  29. doc/doxygen/chapters/code/hello_pragma2.c (+2 -2)
  30. doc/doxygen/chapters/code/matmul_pragma.c (+2 -2)
  31. doc/doxygen/chapters/code/matmul_pragma2.c (+2 -2)
  32. doc/doxygen/chapters/code/multiformat.c (+2 -2)
  33. doc/doxygen/chapters/code/scal_pragma.cu (+2 -2)
  34. doc/doxygen/chapters/code/simgrid.c (+2 -2)
  35. doc/doxygen/chapters/code/vector_scal_c.c (+2 -2)
  36. doc/doxygen/chapters/code/vector_scal_cpu.c (+3 -3)
  37. doc/doxygen/chapters/code/vector_scal_cuda.cu (+2 -3)
  38. doc/doxygen/chapters/code/vector_scal_opencl.c (+2 -2)
  39. doc/doxygen/chapters/code/vector_scal_opencl_codelet.cl (+2 -2)
  40. doc/doxygen/doxygen.cfg (+2 -2)
  41. doc/doxygen/doxygen_filter.sh.in (+1 -2)
  42. include/starpu_sched_ctx.h (+4 -0)
  43. include/starpu_sched_ctx_hypervisor.h (+1 -3)
  44. mpi/src/starpu_mpi.c (+2 -1)
  45. sc_hypervisor/include/sc_hypervisor_monitoring.h (+4 -6)
  46. sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c (+15 -13)
  47. sc_hypervisor/src/policies_utils/lp_tools.c (+11 -5)
  48. sc_hypervisor/src/sc_hypervisor.c (+36 -41)
  49. src/common/barrier_counter.c (+22 -2)
  50. src/common/barrier_counter.h (+3 -0)
  51. src/common/fxt.h (+15 -0)
  52. src/core/jobs.c (+0 -1)
  53. src/core/sched_ctx.c (+69 -15)
  54. src/core/sched_ctx.h (+2 -6)
  55. src/core/sched_policy.c (+39 -19)
  56. src/core/task.c (+5 -3)
  57. src/core/workers.c (+4 -0)
  58. src/core/workers.h (+8 -0)
  59. src/datawizard/interfaces/data_interface.c (+0 -1)
  60. src/debug/traces/starpu_fxt.c (+30 -0)
  61. src/debug/traces/starpu_paje.c (+3 -1)

+ 4 - 1
configure.ac

@@ -2429,7 +2429,10 @@ AC_MSG_NOTICE([
 
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
-	--enable-maxopencldev, --enable-maxbuffers)
+	--enable-maxopencldev, --enable-maxmicdev, --enable-maxnodes,
+        --enable-maxbuffers)
+        (Note these numbers do not represent the number of detected
+	devices, but the maximum number of devices StarPU can manage)
 
 	Maximum number of CPUs:           $maxcpus
 	Maximum number of CUDA devices:   $nmaxcudadev

+ 23 - 22
doc/doxygen/Makefile.am

@@ -24,27 +24,26 @@ DOX_PDF = starpu.pdf
 DOX_TAG = starpu.tag
 
 chapters =	\
-	chapters/advanced_examples.doxy \
-	chapters/basic_examples.doxy \
-	chapters/building.doxy \
-	chapters/c_extensions.doxy \
-	chapters/fft_support.doxy \
-	chapters/introduction.doxy \
-	chapters/mpi_support.doxy \
-	chapters/optimize_performance.doxy \
-	chapters/performance_feedback.doxy \
-	chapters/scheduling_context_hypervisor.doxy \
-	chapters/scheduling_contexts.doxy \
-	chapters/modularized_scheduler.doxy \
-	chapters/out_of_core.doxy \
-	chapters/socl_opencl_extensions.doxy \
-	chapters/tips_and_tricks.doxy \
-	chapters/environment_variables.doxy \
-	chapters/configure_options.doxy \
-	chapters/fdl-1.3.doxy \
-	chapters/scaling-vector-example.doxy \
-	chapters/mic_scc_support.doxy \
-	chapters/files.doxy \
+	chapters/00introduction.doxy \
+	chapters/01building.doxy \
+	chapters/02basic_examples.doxy \
+	chapters/03advanced_examples.doxy \
+	chapters/04optimize_performance.doxy \
+	chapters/05performance_feedback.doxy \
+	chapters/06tips_and_tricks.doxy \
+	chapters/07out_of_core.doxy \
+	chapters/08mpi_support.doxy \
+	chapters/09fft_support.doxy \
+	chapters/10mic_scc_support.doxy \
+	chapters/11c_extensions.doxy \
+	chapters/12socl_opencl_extensions.doxy \
+	chapters/13scheduling_contexts.doxy \
+	chapters/14scheduling_context_hypervisor.doxy \
+	chapters/15environment_variables.doxy \
+	chapters/16configure_options.doxy \
+	chapters/17files.doxy \
+	chapters/18scaling-vector-example.doxy \
+	chapters/19fdl-1.3.doxy \
 	chapters/code/hello_pragma2.c \
 	chapters/code/hello_pragma.c \
 	chapters/code/scal_pragma.cu \
@@ -196,7 +195,6 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_profiling.h	\
 	$(top_srcdir)/include/starpu_bound.h		\
 	$(top_srcdir)/include/starpu_scheduler.h	\
-	$(top_srcdir)/include/starpu_sched_node.h	\
 	$(top_srcdir)/include/starpu_sched_ctx.h	\
 	$(top_srcdir)/include/starpu_sched_ctx_hypervisor.h		\
 	$(top_srcdir)/include/starpu_top.h		\
@@ -219,6 +217,9 @@ $(DOX_TAG): $(dox_inputs)
 	$(DOXYGEN) $(DOX_CONFIG)
 	sed -i 's/ModuleDocumentation <\/li>/<a class="el" href="modules.html">Modules<\/a>/' html/index.html
 	sed -i 's/FileDocumentation <\/li>/<a class="el" href="files.html">Files<\/a>/' html/index.html
+        # comment for the line above: what we really want to do is to remove the line, but by doing so, it avoids opening the interactive menu when browsing files
+	sed -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js
+	sed -i 's/.*"Files.html".*//' html/pages.html
 
 dist_pdf_DATA = $(DOX_PDF)
 

+ 228 - 0
doc/doxygen/chapters/00introduction.doxy

@@ -0,0 +1,228 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+*/
+
+/*! \mainpage Introduction
+
+\htmlonly
+<h1><a class="anchor" id="Foreword"></a>Foreword</h1>
+\endhtmlonly
+\htmlinclude version.html
+\htmlinclude foreword.html
+
+\section Motivation Motivation
+
+// This is a comment and it will be removed before the file is processed by doxygen
+// complex machines with heterogeneous cores/devices
+
+The use of specialized hardware such as accelerators or coprocessors offers an
+interesting approach to overcome the physical limits encountered by processor
+architects. As a result, many machines are now equipped with one or several
+accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
+effort has been devoted to offloading computation onto such accelerators, very
+little attention has been paid to portability concerns on the one hand, and to the
+possibility of having heterogeneous accelerators and processors interact on the other hand.
+
+StarPU is a runtime system that offers support for heterogeneous multicore
+architectures: it not only offers a unified view of the computational resources
+(i.e. CPUs and accelerators at the same time), but it also takes care of
+efficiently mapping and executing tasks onto a heterogeneous machine while
+transparently handling low-level issues such as data transfers in a portable
+fashion.
+
+// this leads to a complicated distributed memory design
+// which is not (easily) manageable by hand
+// added value/benefits of StarPU
+//    - portability
+//   - scheduling, perf. portability
+
+\section StarPUInANutshell StarPU in a Nutshell
+
+StarPU is a software tool aiming to allow programmers to exploit the
+computing power of the available CPUs and GPUs, while relieving them
+from the need to specially adapt their programs to the target machine
+and processing units.
+
+At the core of StarPU is its run-time support library, which is
+responsible for scheduling application-provided tasks on heterogeneous
+CPU/GPU machines.  In addition, StarPU comes with programming language
+support, in the form of extensions to languages of the C family
+(\ref cExtensions), as well as an OpenCL front-end (\ref SOCLOpenclExtensions).
+
+StarPU's run-time and programming language extensions support a
+task-based programming model. Applications submit computational
+tasks, with CPU and/or GPU implementations, and StarPU schedules these
+tasks and associated data transfers on available CPUs and GPUs.  The
+data that a task manipulates are automatically transferred among
+accelerators and the main memory, so that programmers are freed from the
+scheduling issues and technical details associated with these transfers.
+
+StarPU takes particular care of scheduling tasks efficiently, using
+well-known algorithms from the literature (\ref TaskSchedulingPolicy).
+In addition, it allows scheduling experts, such as compiler or
+computational library developers, to implement custom scheduling
+policies in a portable fashion (\ref DefiningANewSchedulingPolicy).
+
+The remainder of this section describes the main concepts used in StarPU.
+
+// explain the notion of codelet and task (i.e. g(A, B)
+
+\subsection CodeletAndTasks Codelet and Tasks
+
+One of the StarPU primary data structures is the \b codelet. A codelet describes a
+computational kernel that can possibly be implemented on multiple architectures
+such as a CPU, a CUDA device or an OpenCL device.
+
+// TODO insert illustration f: f_spu, f_cpu, ...
+
+Another important data structure is the \b task. Executing a StarPU task
+consists in applying a codelet on a data set, on one of the architectures on
+which the codelet is implemented. A task thus describes the codelet that it
+uses, but also which data are accessed, and how they are
+accessed during the computation (read and/or write).
+StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
+operation. The task structure can also specify a \b callback function that is
+called once StarPU has properly executed the task. It also contains optional
+fields that the application may use to give hints to the scheduler (such as
+priority levels).
+
+By default, task dependencies are inferred from data dependencies (sequential
+coherence) by StarPU. The application can however disable sequential coherency
+for some data, in which case dependencies can be expressed by hand.
+A task may be identified by a unique 64-bit number chosen by the application,
+which we refer to as a \b tag.
+Task dependencies can be enforced by hand either by means of callback functions, by
+submitting other tasks, or by expressing dependencies
+between tags (which can thus correspond to tasks that have not been submitted
+yet).
+
+// TODO insert illustration f(Ar, Brw, Cr) + ..
+// DSM
+
+\subsection StarPUDataManagementLibrary StarPU Data Management Library
+
+Because StarPU schedules tasks at runtime, data transfers have to be
+done automatically and ``just-in-time'' between processing units,
+relieving the application programmer from explicit data transfers.
+Moreover, to avoid unnecessary transfers, StarPU keeps data
+where it was last needed, even if it was modified there, and it
+allows multiple copies of the same data to reside at the same time on
+several processing units as long as it is not modified.
+
+\section ApplicationTaskification Application Taskification
+
+TODO
+
+// TODO: section describing what taskifying an application means: before
+// porting to StarPU, turn the program into:
+// "pure" functions, which only access data from their passed parameters
+// a main function which just calls these pure functions
+// and then it's trivial to use StarPU or any other kind of task-based library:
+// simply replace calling the function with submitting a task.
+
+\section Glossary Glossary
+
+A \b codelet records pointers to various implementations of the same
+theoretical function.
+
+A <b>memory node</b> can be either the main RAM, GPU-embedded memory or a disk memory.
+
+A \b bus is a link between memory nodes.
+
+A <b>data handle</b> keeps track of replicates of the same data (\b registered by the
+application) over various memory nodes. The data management library keeps
+them coherent.
+
+The \b home memory node of a data handle is the memory node from which the data
+was registered (usually the main memory node).
+
+A \b task represents a scheduled execution of a codelet on some data handles.
+
+A \b tag is a rendez-vous point. Tasks typically have their own tag, and can
+depend on other tags. The value is chosen by the application.
+
+A \b worker executes tasks. There is typically one per CPU computation core and
+one per accelerator (for which a whole CPU core is dedicated).
+
+A \b driver drives a given kind of worker. There are currently CPU, CUDA,
+and OpenCL drivers. They usually start several workers to actually drive
+them.
+
+A <b>performance model</b> is a (dynamic or static) model of the performance of a
+given codelet. Codelets can have an execution time performance model as well as
+a power consumption performance model.
+
+A data \b interface describes the layout of the data: for a vector, a pointer
+for the start, the number of elements and the size of elements ; for a matrix, a
+pointer for the start, the number of elements per row, the offset between rows,
+and the size of each element ; etc. To access their data, codelet functions are
+given interfaces for the local memory node replicates of the data handles of the
+scheduled task.
+
+\b Partitioning data means dividing the data of a given data handle (called
+\b father) into a series of \b children data handles which designate various
+portions of the former.
+
+A \b filter is the function which computes children data handles from a father
+data handle, and thus describes how the partitioning should be done (horizontal,
+vertical, etc.)
+
+\b Acquiring a data handle can be done from the main application, to safely
+access the data of a data handle from its home node, without having to
+unregister it.
+
+
+\section ResearchPapers Research Papers
+
+Research papers about StarPU can be found at
+http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html.
+
+A good overview is available in the research report at
+http://hal.archives-ouvertes.fr/inria-00467677.
+
+\section FurtherReading Further Reading
+
+The documentation chapters include
+
+<ol>
+<li> Part: Using StarPU
+<ul>
+<li> \ref BuildingAndInstallingStarPU
+<li> \ref BasicExamples
+<li> \ref AdvancedExamples
+<li> \ref HowToOptimizePerformanceWithStarPU
+<li> \ref PerformanceFeedback
+<li> \ref TipsAndTricksToKnowAbout
+<li> \ref OutOfCore
+<li> \ref MPISupport
+<li> \ref FFTSupport
+<li> \ref MICSCCSupport
+<li> \ref cExtensions
+<li> \ref SOCLOpenclExtensions
+<li> \ref SchedulingContexts
+<li> \ref SchedulingContextHypervisor
+</ul>
+</li>
+<li> Part: Inside StarPU
+<ul>
+<li> \ref ExecutionConfigurationThroughEnvironmentVariables
+<li> \ref CompilationConfiguration
+<li> \ref ModuleDocumentation
+<li> \ref FileDocumentation
+<li> \ref deprecated
+</ul>
+<li> Part: Appendix
+<ul>
+<li> \ref FullSourceCodeVectorScal
+<li> \ref GNUFreeDocumentationLicense
+</ul>
+</ol>
+
+
+Make sure to have a look at those too!
+
+*/
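
Editor's note: the new introduction chapter above describes codelets, tasks and the data management library in prose only. As a quick illustration of those concepts, here is a minimal sketch of a StarPU C program; the kernel, vector size and scaling factor are arbitrary, and every call used (starpu_init(), starpu_vector_data_register(), starpu_task_create(), starpu_task_submit(), ...) is the standard public StarPU API, not something introduced by this commit.

/* Minimal sketch: a codelet with one CPU implementation applied to a
 * registered vector.  Illustrative only; names and sizes are arbitrary. */
#include <starpu.h>
#include <stdint.h>

static void scal_cpu(void *buffers[], void *cl_arg)
{
	/* The data interface gives access to the local replicate of the handle. */
	struct starpu_vector_interface *v = buffers[0];
	unsigned n = STARPU_VECTOR_GET_NX(v);
	float *val = (float *)STARPU_VECTOR_GET_PTR(v);
	float factor = *(float *)cl_arg;
	unsigned i;
	for (i = 0; i < n; i++)
		val[i] *= factor;
}

static struct starpu_codelet scal_cl =
{
	.cpu_funcs = {scal_cpu, NULL},   /* CUDA/OpenCL variants could be added here */
	.nbuffers = 1,
	.modes = {STARPU_RW},
};

int main(void)
{
	float vector[1024];
	float factor = 3.14f;
	starpu_data_handle_t handle;

	if (starpu_init(NULL) != 0)
		return 77;

	/* Register the vector so StarPU can manage its transfers. */
	starpu_vector_data_register(&handle, STARPU_MAIN_RAM,
				    (uintptr_t)vector, 1024, sizeof(vector[0]));

	/* Create and submit an asynchronous task applying the codelet. */
	struct starpu_task *task = starpu_task_create();
	task->cl = &scal_cl;
	task->handles[0] = handle;
	task->cl_arg = &factor;
	task->cl_arg_size = sizeof(factor);
	starpu_task_submit(task);

	starpu_task_wait_for_all();
	starpu_data_unregister(handle);
	starpu_shutdown();
	return 0;
}

The STARPU_RW access mode is what lets StarPU infer the task dependencies and schedule the data transfers that the chapter describes.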

doc/doxygen/chapters/building.doxy → doc/doxygen/chapters/01building.doxy


+ 8 - 12
doc/doxygen/chapters/basic_examples.doxy

@@ -22,7 +22,7 @@ implementations (for CPU, OpenCL, and/or CUDA), and invoke the task like
 a regular C function.  The example below defines <c>my_task</c> which
 has a single implementation for CPU:
 
-\snippet hello_pragma.c To be included
+\snippet hello_pragma.c To be included. You should update doxygen if you see that text.
 
 The code can then be compiled and linked with GCC and the flag <c>-fplugin</c>:
 
@@ -109,9 +109,7 @@ starpu_task_create(). This function only allocates and fills the
 corresponding structure with the default settings, but it does not
 submit the task to StarPU.
 
-\internal
-not really clear ;)
-\endinternal
+// not really clear ;)
 
 The field starpu_task::cl is a pointer to the codelet which the task will
 execute: in other words, the codelet structure describes which computational
@@ -198,9 +196,7 @@ not count the argument --- the parameter <c>cl_arg</c> of the function
 <c>cpu_func</c> --- since it is not managed by our data management
 library, but just contains trivial parameters.
 
-\internal
-TODO rewrite so that it is a little clearer ?
-\endinternal
+// TODO rewrite so that it is a little clearer ?
 
 Be aware that this may be a pointer to a
 \em copy of the actual buffer, and not the pointer given by the programmer:
@@ -352,7 +348,7 @@ vector_scal (unsigned size, float vector[size], float factor)
 Next, the body of the program, which uses the task defined above, can be
 implemented:
 
-\snippet hello_pragma2.c To be included
+\snippet hello_pragma2.c To be included. You should update doxygen if you see that text.
 
 The function <c>main</c> above does several things:
 
@@ -488,7 +484,7 @@ The actual implementation of the CUDA task goes into a separate
 compilation unit, in a <c>.cu</c> file.  It is very close to the
 implementation when using StarPU's standard C API (\ref DefinitionOfTheCUDAKernel).
 
-\snippet scal_pragma.cu To be included
+\snippet scal_pragma.cu To be included. You should update doxygen if you see that text.
 
 The complete source code, in the directory <c>gcc-plugin/examples/vector_scal</c>
 of the StarPU distribution, also shows how an SSE-specialized
@@ -628,7 +624,7 @@ that the vector pointer returned by ::STARPU_VECTOR_GET_PTR is here a
 pointer in GPU memory, so that it can be passed as such to the
 kernel call <c>vector_mult_cuda</c>.
 
-\snippet vector_scal_cuda.cu To be included
+\snippet vector_scal_cuda.cu To be included. You should update doxygen if you see that text.
 
 \subsection DefinitionOfTheOpenCLKernel Definition of the OpenCL Kernel
 
@@ -650,7 +646,7 @@ which returns a <c>cl_mem</c> (which is not a device pointer, but an OpenCL
 handle), which can be passed as such to the OpenCL kernel. The difference is
 important when using partitioning, see \ref PartitioningData.
 
-\snippet vector_scal_opencl.c To be included
+\snippet vector_scal_opencl.c To be included. You should update doxygen if you see that text.
 
 \subsection DefinitionOfTheMainCode Definition of the Main Code
 
@@ -661,7 +657,7 @@ starpu_codelet::cuda_funcs and starpu_codelet::opencl_funcs are set to
 define the pointers to the CUDA and OpenCL implementations of the
 task.
 
-\snippet vector_scal_c.c To be included
+\snippet vector_scal_c.c To be included. You should update doxygen if you see that text.
 
 \subsection ExecutionOfHybridVectorScaling Execution of Hybrid Vector Scaling
 

+ 3 - 3
doc/doxygen/chapters/advanced_examples.doxy

@@ -769,7 +769,7 @@ the CPU binding mask that StarPU chose.
 For instance, using OpenMP (full source is available in
 <c>examples/openmp/vector_scal.c</c>):
 
-\snippet forkmode.c To be included
+\snippet forkmode.c To be included. You should update doxygen if you see that text.
 
 Other examples include for instance calling a BLAS parallel CPU implementation
 (see <c>examples/mult/xgemm.c</c>).
@@ -891,7 +891,7 @@ will be able to convert data from one data structure to the other when needed.
 Note that the scheduler <c>dmda</c> is the only one optimized for this
 interface. The user must provide StarPU with conversion codelets:
 
-\snippet multiformat.c To be included
+\snippet multiformat.c To be included. You should update doxygen if you see that text.
 
 Kernels can be written almost as for any other interface. Note that
 ::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
@@ -1115,7 +1115,7 @@ Similar functions need to be defined to access the different fields of the
 complex interface from a <c>void *</c> pointer to be used within codelet
 implemetations.
 
-\snippet complex.c To be included
+\snippet complex.c To be included. You should update doxygen if you see that text.
 
 Complex data interfaces can then be registered to StarPU.
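
Editor's note: as a companion to the complex-interface hunk above, here is a sketch of a CPU kernel built on the accessors added in doc/doxygen/chapters/code/complex.c further down in this commit. The kernel name is invented, and the field types (two double arrays of nx elements in struct starpu_complex_interface) are an assumption based on StarPU's complex interface example, not something this diff defines.

/* Hypothetical CPU kernel using the complex-interface accessors from
 * chapters/code/complex.c (repeated here so the sketch is self-contained). */
#define STARPU_COMPLEX_GET_REAL(interface)	\
        (((struct starpu_complex_interface *)(interface))->real)
#define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
        (((struct starpu_complex_interface *)(interface))->imaginary)
#define STARPU_COMPLEX_GET_NX(interface)	\
        (((struct starpu_complex_interface *)(interface))->nx)

void complex_scale_cpu(void *descr[], void *cl_arg)
{
	double factor = *(double *)cl_arg;
	int nx = STARPU_COMPLEX_GET_NX(descr[0]);
	double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
	double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
	int i;

	for (i = 0; i < nx; i++)
	{
		real[i] *= factor;
		imaginary[i] *= factor;
	}
}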
 

+ 1 - 1
doc/doxygen/chapters/optimize_performance.doxy

@@ -547,6 +547,6 @@ case. Since during simgrid execution, the functions of the codelet are actually
 not called, one can use dummy functions such as the following to still permit
 CUDA or OpenCL execution:
 
-\snippet simgrid.c To be included
+\snippet simgrid.c To be included. You should update doxygen if you see that text.
 
 */

+ 4 - 9
doc/doxygen/chapters/performance_feedback.doxy

@@ -108,10 +108,8 @@ these values during the time, for the different workers.
 
 TODO: ajouter \ref STARPU_BUS_STATS
 
-\internal
-how to enable/disable performance monitoring
-what kind of information do we get ?
-\endinternal
+// how to enable/disable performance monitoring
+// what kind of information do we get ?
 
 The bus speed measured by StarPU can be displayed by using the tool
 <c>starpu_machine_display</c>, for instance:
@@ -608,10 +606,7 @@ task, and each color corresponds to a codelet.
 \image html data_trace.png
 \image latex data_trace.eps "" width=\textwidth
 
-
-\internal
-TODO: data transfer stats are similar to the ones displayed when
-setting STARPU_BUS_STATS
-\endinternal
+// TODO: data transfer stats are similar to the ones displayed when
+// setting STARPU_BUS_STATS
 
 */

doc/doxygen/chapters/tips_and_tricks.doxy → doc/doxygen/chapters/06tips_and_tricks.doxy


+ 2 - 2
doc/doxygen/chapters/out_of_core.doxy

@@ -47,10 +47,10 @@ All structures are in \ref API_Out_Of_Core .
 
 \section ExampleDiskCopy Examples: disk_copy
 
-\snippet disk_copy.c To be included
+\snippet disk_copy.c To be included. You should update doxygen if you see that text.
 
 \section ExampleDiskCompute Examples: disk_compute
 
-\snippet disk_compute.c To be included
+\snippet disk_compute.c To be included. You should update doxygen if you see that text.
 
 */

doc/doxygen/chapters/mpi_support.doxy → doc/doxygen/chapters/08mpi_support.doxy


doc/doxygen/chapters/fft_support.doxy → doc/doxygen/chapters/09fft_support.doxy


doc/doxygen/chapters/mic_scc_support.doxy → doc/doxygen/chapters/10mic_scc_support.doxy


+ 4 - 6
doc/doxygen/chapters/c_extensions.doxy

@@ -107,9 +107,7 @@ as a type qualifier for output pointer or array parameters
 Declare the given function as an implementation of <c>task</c> to run on
 <c>target</c>.  <c>target</c> must be a string, currently one of
 <c>"cpu"</c>, <c>"opencl"</c>, or <c>"cuda"</c>.
-\internal
-FIXME: Update when OpenCL support is ready.
-\endinternal
+// FIXME: Update when OpenCL support is ready.
 </dd>
 </dl>
 
@@ -322,7 +320,7 @@ automatic variables.
 The following example illustrates use of the <c>heap_allocated</c>
 attribute:
 
-\snippet cholesky_pragma.c To be included
+\snippet cholesky_pragma.c To be included. You should update doxygen if you see that text.
 
 \section UsingCExtensionsConditionally Using C Extensions Conditionally
 
@@ -340,7 +338,7 @@ extensions.
 The code below illustrates how to define a task and its implementations
 in a way that allows it to be compiled without the GCC plug-in:
 
-\snippet matmul_pragma.c To be included
+\snippet matmul_pragma.c To be included. You should update doxygen if you see that text.
 
 The above program is a valid StarPU program when StarPU's GCC plug-in is
 used; it is also a valid sequential program when the plug-in is not
@@ -354,7 +352,7 @@ unable to parse the attribute syntax (In practice, Clang and
 several proprietary compilers implement attributes.), so you may want to
 wrap attributes in macros like this:
 
-\snippet matmul_pragma2.c To be included
+\snippet matmul_pragma2.c To be included. You should update doxygen if you see that text.
 
 */
 

doc/doxygen/chapters/socl_opencl_extensions.doxy → doc/doxygen/chapters/12socl_opencl_extensions.doxy


doc/doxygen/chapters/scheduling_contexts.doxy → doc/doxygen/chapters/13scheduling_contexts.doxy


doc/doxygen/chapters/scheduling_context_hypervisor.doxy → doc/doxygen/chapters/14scheduling_context_hypervisor.doxy


+ 3 - 1
doc/doxygen/chapters/environment_variables.doxy

@@ -629,7 +629,7 @@ By default the hypervisor resizes the contexts in a lazy way, that is workers ar
 before removing them from the previous one. Once this workers are clearly taken into account 
 into the new context (a task was poped there) we remove them from the previous one. However if the application
 would like that the change in the distribution of workers should change right away this variable should be set to 0
-</dl>
+</dd>
 
 <dt>SC_HYPERVISOR_SAMPLE_CRITERIA</dt>
 <dd>
@@ -638,6 +638,8 @@ would like that the change in the distribution of workers should change right aw
 By default the hypervisor uses a sample of flops when computing the speed of the contexts and of the workers.
 If this variable is set to <c>time</c> the hypervisor uses a sample of time (10% of an aproximation of the total
 execution time of the application)
+</dd>
+
 </dl>
 
 */

+ 3 - 3
doc/doxygen/chapters/configure_options.doxy

@@ -352,6 +352,8 @@ Specify the precise MIC architecture host identifier.
 The default value is <c>x86_64-k1om-linux</c>
 </dd>
 
+</dl>
+
 \section AdvancedConfiguration Advanced Configuration
 
 <dl>
@@ -406,9 +408,7 @@ CUDA. Still experimental.
 \anchor enable-opengl-render
 \addindex __configure__--enable-opengl-render
 Enable the use of OpenGL for the rendering of some examples.
-\internal
-TODO: rather default to enabled when detected
-\endinternal
+// TODO: rather default to enabled when detected
 </dd>
 
 <dt>--enable-blas-lib</dt>

+ 52 - 0
doc/doxygen/chapters/17files.doxy

@@ -0,0 +1,52 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+*/
+
+/*! \page Files Files
+
+\file starpu_deprecated_api.h
+\file starpu.h
+\file starpu_data_filters.h
+\file starpu_data_interfaces.h
+\file starpu_disk.h
+\file starpu_worker.h
+\file starpu_task.h
+\file starpu_task_bundle.h
+\file starpu_task_list.h
+\file starpu_task_util.h
+\file starpu_data.h
+\file starpu_perfmodel.h
+\file starpu_util.h
+\file starpu_fxt.h
+\file starpu_cuda.h
+\file starpu_opencl.h
+\file starpu_sink.h
+\file starpu_mic.h
+\file starpu_scc.h
+\file starpu_expert.h
+\file starpu_profiling.h
+\file starpu_bound.h
+\file starpu_scheduler.h
+\file starpu_sched_ctx.h
+\file starpu_sched_ctx_hypervisor.h
+\file starpu_top.h
+\file starpu_hash.h
+\file starpu_rand.h
+\file starpu_cublas.h
+\file starpu_driver.h
+\file starpu_stdlib.h
+\file starpu_thread.h
+\file starpu_thread_util.h
+\file starpu_mpi.h
+\file sc_hypervisor.h
+\file sc_hypervisor_config.h
+\file sc_hypervisor_lp.h
+\file sc_hypervisor_monitoring.h
+\file sc_hypervisor_policy.h
+\file starpu_config.h
+
+*/

+ 5 - 5
doc/doxygen/chapters/scaling-vector-example.doxy

@@ -10,25 +10,25 @@
 
 \section MainApplication Main Application
 
-\snippet vector_scal_c.c To be included
+\snippet vector_scal_c.c To be included. You should update doxygen if you see that text.
 
 \section CPUKernel CPU Kernel
 
-\snippet vector_scal_cpu.c To be included
+\snippet vector_scal_cpu.c To be included. You should update doxygen if you see that text.
 
 \section CUDAKernel CUDA Kernel
 
-\snippet vector_scal_cuda.cu To be included
+\snippet vector_scal_cuda.cu To be included. You should update doxygen if you see that text.
 
 \section OpenCLKernel OpenCL Kernel
 
 \subsection InvokingtheKernel Invoking the Kernel
 
-\snippet vector_scal_opencl.c To be included
+\snippet vector_scal_opencl.c To be included. You should update doxygen if you see that text.
 
 \subsection SourceoftheKernel Source of the Kernel
 
-\snippet vector_scal_opencl_codelet.cl To be included
+\snippet vector_scal_opencl_codelet.cl To be included. You should update doxygen if you see that text.
 
 */
 

doc/doxygen/chapters/fdl-1.3.doxy → doc/doxygen/chapters/19fdl-1.3.doxy


+ 2 - 2
doc/doxygen/chapters/code/cholesky_pragma.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 extern void cholesky(unsigned nblocks, unsigned size,
                     float mat[nblocks][nblocks][size])
   __attribute__ ((task));
@@ -47,4 +47,4 @@ main (int argc, char *argv[])
 
   return EXIT_SUCCESS;
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/complex.c

@@ -15,11 +15,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 #define STARPU_COMPLEX_GET_REAL(interface)	\
         (((struct starpu_complex_interface *)(interface))->real)
 #define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
         (((struct starpu_complex_interface *)(interface))->imaginary)
 #define STARPU_COMPLEX_GET_NX(interface)	\
         (((struct starpu_complex_interface *)(interface))->nx)
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/disk_compute.c

@@ -13,7 +13,7 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 /* Try to write into disk memory
  * Use mechanism to push datas from main ram to disk ram
  */
@@ -175,5 +175,5 @@ enodev:
 enoent:
 	return 77;
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 

+ 2 - 2
doc/doxygen/chapters/code/disk_copy.c

@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 
 /* Try to write into disk memory
  * Use mechanism to push datas from main ram to disk ram
@@ -119,4 +119,4 @@ enoent:
 	return 77;
 }
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/forkmode.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 void scal_cpu_func(void *buffers[], void *_args)
 {
     unsigned i;
@@ -39,4 +39,4 @@ static struct starpu_codelet cl =
     .cpu_funcs_name = {"scal_cpu_func", NULL},
     .nbuffers = 1,
 };
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/hello_pragma.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 #include <stdio.h>
 
 /* Task declaration.  */
@@ -43,4 +43,4 @@ int main ()
 
   return 0;
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/hello_pragma2.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 int main (void)
 {
 #pragma starpu initialize
@@ -40,4 +40,4 @@ int main (void)
 
   return valid ? EXIT_SUCCESS : EXIT_FAILURE;
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/matmul_pragma.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 /* This program is valid, whether or not StarPU's GCC plug-in
    is being used.  */
 
@@ -70,4 +70,4 @@ main (int argc, char *argv[])
 
   return EXIT_SUCCESS;
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/matmul_pragma2.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 /* Use the `task' attribute only when StarPU's GCC plug-in
    is available.   */
 #ifdef STARPU_GCC_PLUGIN
@@ -26,4 +26,4 @@
 
 static void matmul (const float *A, const float *B, float *C,
                     unsigned nx, unsigned ny, unsigned nz) __task;
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/multiformat.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 #define NX 1024
 struct point array_of_structs[NX];
 starpu_data_handle_t handle;
@@ -58,4 +58,4 @@ struct starpu_multiformat_data_interface_ops format_ops = {
 };
 
 starpu_multiformat_data_register(handle, STARPU_MAIN_RAM, &array_of_structs, NX, &format_ops);
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/scal_pragma.cu

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 /* CUDA implementation of the `vector_scal' task, to be compiled with `nvcc'. */
 
 #include <starpu.h>
@@ -42,4 +42,4 @@ vector_scal_cuda (size_t size, float vector[], float factor)
 
   cudaStreamSynchronize (starpu_cuda_get_local_stream ());
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/simgrid.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 static struct starpu_codelet cl11 =
 {
 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
@@ -29,4 +29,4 @@ static struct starpu_codelet cl11 =
 	.modes = {STARPU_RW},
 	.model = &chol_model_11
 };
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/vector_scal_c.c

@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 /*
  * This example demonstrates how to use StarPU to scale an array by a factor.
  * It shows how to manipulate data with StarPU's data management library.
@@ -125,4 +125,4 @@ int main(int argc, char **argv)
 
     return 0;
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 3 - 3
doc/doxygen/chapters/code/vector_scal_cpu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 
 #include <starpu.h>
 #include <xmmintrin.h>
@@ -75,4 +75,4 @@ void scal_sse_func(void *buffers[], void *cl_arg)
         }
     }
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 3
doc/doxygen/chapters/code/vector_scal_cuda.cu

@@ -14,7 +14,7 @@
  *
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 #include <starpu.h>
 
 static __global__ void vector_mult_cuda(unsigned n, float *val,
@@ -41,5 +41,4 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
-//! [To be included]
-
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/vector_scal_opencl.c

@@ -16,7 +16,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 #include <starpu.h>
 
 extern struct starpu_opencl_program programs;
@@ -75,4 +75,4 @@ void scal_opencl_func(void *buffers[], void *_args)
 	 starpu_opencl_release_kernel(kernel);
     }
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/chapters/code/vector_scal_opencl_codelet.cl

@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]
 __kernel void vector_mult_opencl(int nx, __global float* val, float factor)
 {
         const int i = get_global_id(0);
@@ -22,4 +22,4 @@ __kernel void vector_mult_opencl(int nx, __global float* val, float factor)
                 val[i] *= factor;
         }
 }
-//! [To be included]
+//! [To be included. You should update doxygen if you see that text.]

+ 2 - 2
doc/doxygen/doxygen.cfg

@@ -1003,7 +1003,7 @@ HTML_TIMESTAMP         = YES
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
 
-HTML_DYNAMIC_SECTIONS  = NO
+HTML_DYNAMIC_SECTIONS  = YES
 
 # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of
 # entries shown in the various tree structured indices initially; the user
@@ -1185,7 +1185,7 @@ DISABLE_INDEX          = NO
 # Since the tree basically has the same information as the tab index you
 # could consider to set DISABLE_INDEX to NO when enabling this option.
 
-GENERATE_TREEVIEW      = NO
+GENERATE_TREEVIEW      = YES
 
 # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
 # (range [0,1..20]) that doxygen will group on one line in the generated HTML

+ 1 - 2
doc/doxygen/doxygen_filter.sh.in

@@ -3,7 +3,6 @@
 if [ "$(basename $1)" == "starpufft.h" ] ; then
     gcc -E $1 -I @top_srcdir@/include/ -I @top_builddir@/include/ |grep starpufft
 else
-    SUFFIX_C=$(basename $1 .c)
-    sed -e 's/STARPU_DEPRECATED//' $1
+    sed -e 's/STARPU_DEPRECATED//' $1 | sed 's/\/\/.*//'
 fi
 

+ 4 - 0
include/starpu_sched_ctx.h

@@ -102,6 +102,10 @@ void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
 
 void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
 
+int starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id);
+
+double starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif //STARPU_USE_SC_HYPERVISOR
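
Editor's note: the two declarations added above replace the ready-task counters that the hypervisor used to cache per context (see the sc_hypervisor changes below). A hedged sketch of how a caller might use them; the helper name and the threshold (borrowed from the lp_tools.c hunk below, where ready flops below 0.000002 Gflops are treated as zero) are illustrative only.

/* Illustrative helper: decide whether a scheduling context still has a
 * non-negligible amount of ready work, using the two new queries. */
#include <starpu.h>

static int ctx_has_ready_work(unsigned sched_ctx_id)
{
	int ntasks = starpu_get_nready_tasks_of_sched_ctx(sched_ctx_id);
	double flops = starpu_get_nready_flops_of_sched_ctx(sched_ctx_id);

	/* Ignore contexts whose ready flops are negligible (same cut-off as lp_tools.c). */
	return ntasks > 0 && (flops / 1000000000.0) > 0.000002;
}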

+ 1 - 3
include/starpu_sched_ctx_hypervisor.h

@@ -29,10 +29,8 @@ struct starpu_sched_ctx_performance_counters
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
 	void (*notify_poped_task)(unsigned sched_ctx_id, int worker);
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_post_exec_task)(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag,
-				      int nready_tasks, double nready_flops);
+	void (*notify_post_exec_task)(struct starpu_task *task, size_t data_size, uint32_t footprint, int hypervisor_tag, double flops);
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint, size_t data_size);
-	void (*notify_ready_task)(unsigned sched_ctx_id, struct starpu_task *task);
 	void (*notify_empty_ctx)(unsigned sched_ctx_id, struct starpu_task *task);
 	void (*notify_delete_context)(unsigned sched_ctx);
 };

+ 2 - 1
mpi/src/starpu_mpi.c

@@ -1473,7 +1473,8 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			struct _starpu_mpi_copy_handle_hashlist *current, *tmp;
 			HASH_ITER(hh, _starpu_mpi_copy_handle_hashmap[n], current, tmp)
 			{
-			     HASH_DEL(_starpu_mpi_copy_handle_hashmap[n], current);
+				HASH_DEL(_starpu_mpi_copy_handle_hashmap[n], current);
+				free(current);
 			}
 		}
 	}

+ 4 - 6
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -100,14 +100,15 @@ struct sc_hypervisor_wrapper
 	/* number of flops that still have to be executed in this ctx */
 	double remaining_flops;
 	
-	/* number of flops coresponding to the ready tasks in this ctx */
-	double ready_flops;
-
 	/* the start time of the resizing sample of this context*/
 	double start_time;
 
 	/* the first time a task was pushed to this context*/
 	double real_start_time;
+	
+	/* the start time for sample in which the hyp is not allowed to react
+	   bc too expensive */
+	double hyp_react_start_time;
 
 	/* the workers don't leave the current ctx until the receiver ctx 
 	   doesn't ack the receive of these workers */
@@ -120,9 +121,6 @@ struct sc_hypervisor_wrapper
 	   flops of all the execution or not */
 	unsigned total_flops_available;
 
-	/* the number of ready tasks submitted to a ctx */
-	int nready_tasks;
-
 	/* boolean indicating that a context is being sized */
 	unsigned to_be_sized;
 

+ 15 - 13
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -1,3 +1,4 @@
+
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011 - 2013  INRIA
@@ -24,8 +25,8 @@ int resize_no = 0;
 static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
 	/* for vite */
-/* 	printf("resize_no = %d\n", resize_no); */
-/* 	starpu_trace_user_event(resize_no++); */
+	printf("resize_no = %d\n", resize_no);
+	starpu_trace_user_event(resize_no++);
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
 	unsigned curr_nworkers = nworkers == -1 ? starpu_worker_get_count() : (unsigned)nworkers;
@@ -63,19 +64,20 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, i
 static void feft_lp_handle_poped_task(__attribute__((unused))unsigned sched_ctx, __attribute__((unused))int worker, 
 				      __attribute__((unused))struct starpu_task *task, __attribute__((unused))uint32_t footprint)
 {
-	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
-	if(ret != EBUSY)
+	unsigned criteria = sc_hypervisor_get_resize_criteria();
+	if(criteria != SC_NOTHING && criteria == SC_SPEED)
 	{
-		unsigned criteria = sc_hypervisor_get_resize_criteria();
-		if(criteria != SC_NOTHING && criteria == SC_SPEED)
+
+		int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+		if(ret != EBUSY)
 		{
 			if(sc_hypervisor_check_speed_gap_btw_ctxs())
 			{
 				_try_resizing(NULL, -1, NULL, -1);
 			}
-		}
 	
-		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+			starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+		}
 	}
 
 }
@@ -151,16 +153,16 @@ static void feft_lp_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *worker
 
 static void feft_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
 {
-	int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
-	if(ret != EBUSY)
+	unsigned criteria = sc_hypervisor_get_resize_criteria();
+	if(criteria != SC_NOTHING && criteria == SC_IDLE)
 	{
-		unsigned criteria = sc_hypervisor_get_resize_criteria();
-		if(criteria != SC_NOTHING && criteria == SC_IDLE)
+		int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
+		if(ret != EBUSY)
 		{
 			if(sc_hypervisor_check_idle(sched_ctx, worker))
 				_try_resizing(NULL, -1, NULL, -1);
+			starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 		}
-		starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	}
 }
 

+ 11 - 5
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -40,6 +40,9 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 		int w;
 		for(w = 0; w < nw; w++)
 			v[i][w] = sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw)); 
+
+		double ready_flops = starpu_get_nready_flops_of_sched_ctx(sc_w->sched_ctx);
+		int nready_tasks = starpu_get_nready_tasks_of_sched_ctx(sc_w->sched_ctx);
 		
 		if(sc_w->to_be_sized)
 		{
@@ -49,17 +52,19 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 		else
 		{
 			if(sc_w->remaining_flops < 0.0)
-				flops[i] = sc_w->ready_flops/1000000000.0; /* in gflops*/
+				flops[i] = ready_flops/1000000000.0; /* in gflops*/
 			else
 			{
-				if((sc_w->ready_flops/1000000000.0) <= 0.000002)
+				if((ready_flops/1000000000.0) <= 0.000002)
 					flops[i] = 0.0;
 				else
 					flops[i] = sc_w->remaining_flops/1000000000.0; /* in gflops*/
 			}
 		}
-/* 		printf("%d: flops %lf remaining flops %lf ready flops %lf nready_tasks %d\n", */
-/* 		       sched_ctxs[i], flops[i], sc_w->remaining_flops/1000000000, sc_w->ready_flops/1000000000, sc_w->nready_tasks); */
+		if(flops[i] < 0.0)
+			flops[i] = 0.0;
+		printf("%d: flops %lf remaining flops %lf ready flops %lf nready_tasks %d\n",
+		       sched_ctxs[i], flops[i], sc_w->remaining_flops/1000000000, ready_flops/1000000000, nready_tasks);
 
 	}
 
@@ -108,6 +113,7 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 	}
 
 	double vmax = 0.0;
+
 	if(ret != 0.0)
 	{
 		/* redo the lp after cleaning out the contexts that got all the max workers required */
@@ -591,7 +597,7 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(unsigned* sched_ctxs, int ns,
 				{
 					nworkers_to_add=1;
 					int old_start = start[w];
-					if(start[w] == nworkers)
+					if(start[w] != 0)
 						start[w]--;
 					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &nworkers_to_add, arch);
 					start[w] = old_start;

+ 36 - 41
sc_hypervisor/src/sc_hypervisor.c

@@ -25,10 +25,9 @@ struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
 static void notify_pushed_task(unsigned sched_ctx, int worker);
 static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, 
-				  int hypervisor_tag, int nready_tasks, double ready_flops);
+				  int hypervisor_tag, double flops);
 static void notify_poped_task(unsigned sched_ctx, int  worker);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint, size_t data_size);
-static void notify_ready_task(unsigned sched_ctx, struct starpu_task *task);
 static void notify_empty_ctx(unsigned sched_ctx, struct starpu_task *task);
 static void notify_delete_context(unsigned sched_ctx);
 
@@ -181,6 +180,7 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 		hypervisor.sched_ctx_w[i].remaining_flops = 0.0;
 		hypervisor.sched_ctx_w[i].start_time = 0.0;
 		hypervisor.sched_ctx_w[i].real_start_time = 0.0;
+		hypervisor.sched_ctx_w[i].hyp_react_start_time = 0.0;
 		hypervisor.sched_ctx_w[i].resize_ack.receiver_sched_ctx = -1;
 		hypervisor.sched_ctx_w[i].resize_ack.moved_workers = NULL;
 		hypervisor.sched_ctx_w[i].resize_ack.nmoved_workers = 0;
@@ -190,9 +190,7 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 
 		hypervisor.sched_ctx_w[i].ref_speed[0] = -1.0;
 		hypervisor.sched_ctx_w[i].ref_speed[1] = -1.0;
-		hypervisor.sched_ctx_w[i].ready_flops = 0.0;
 		hypervisor.sched_ctx_w[i].total_flops_available = 0;
-		hypervisor.sched_ctx_w[i].nready_tasks = 0;
 		hypervisor.sched_ctx_w[i].to_be_sized = 0;
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
@@ -223,7 +221,6 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 	perf_counters->notify_poped_task = notify_poped_task;
 	perf_counters->notify_post_exec_task = notify_post_exec_task;
 	perf_counters->notify_submitted_job = notify_submitted_job;
-	perf_counters->notify_ready_task = notify_ready_task;
 	perf_counters->notify_empty_ctx = notify_empty_ctx;
 	perf_counters->notify_delete_context = notify_delete_context;
 
@@ -316,6 +313,7 @@ void sc_hypervisor_register_ctx(unsigned sched_ctx, double total_flops)
 	hypervisor.sched_ctx_w[sched_ctx].total_flops = total_flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops = total_flops;
 	hypervisor.resize[sched_ctx] = 1;
+	hypervisor.sched_ctx_w[sched_ctx].hyp_react_start_time = starpu_timing_now();
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
@@ -861,35 +859,30 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 			if(hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] == 0.0)
 			{
 				exec_time = hypervisor.sched_ctx_w[sched_ctx].exec_time[worker];
-//				printf("%d/%d: exec_time %lf\n", worker, sched_ctx, hypervisor.sched_ctx_w[sched_ctx].exec_time[worker]);
 			}
 			else
 			{
 				double current_exec_time = (end_time - hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
 				exec_time = hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] + current_exec_time;
-//				printf("%d/%d: exec_time %lf current_exec_time %lf\n", worker, sched_ctx, hypervisor.sched_ctx_w[sched_ctx].exec_time[worker], current_exec_time);
 			}		
 			norm_exec_time += elapsed_time_worker[worker] == 0.0 ? 0.0 : exec_time / elapsed_time_worker[worker];
 		}			
 
 		double curr_time = starpu_timing_now();
 		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /* in seconds */
-//		double norm_idle_time = max_workers_idle_time[i] / elapsed_time;
-//		double norm_exec_time = exec_time / elapsed_time;
+		int nready_tasks = starpu_get_nready_tasks_of_sched_ctx(sched_ctx);
 		if(norm_idle_time >= 0.9)
 		{
-//			config->max_nworkers = 	workers->nworkers - lrint(norm_idle_time);
 			config->max_nworkers = lrint(norm_exec_time);
-/* 			if(config->max_nworkers > hypervisor.sched_ctx_w[sched_ctx].nready_tasks) */
-/* 				config->max_nworkers = hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1; */
 		}
 		else
 		{
-			if(norm_idle_time < 0.1)//(max_workers_idle_time[i] < 0.000001)
-				config->max_nworkers = lrint(norm_exec_time)  + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1; //workers->nworkers + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1;
+			if(norm_idle_time < 0.1)
+				config->max_nworkers = lrint(norm_exec_time)  + nready_tasks - 1; //workers->nworkers + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1;
 			else
 				config->max_nworkers = lrint(norm_exec_time);
 		}
+//		config->max_nworkers = hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1;
 		
 		if(config->max_nworkers < 0)
 			config->max_nworkers = 0;
@@ -897,7 +890,7 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 			config->max_nworkers = max_cpus;
 		
 		printf("%d: ready tasks  %d idle for long %lf norm_idle_time %lf elapsed_time %lf norm_exec_time %lf nworker %d max %d \n", 
-		       sched_ctx, hypervisor.sched_ctx_w[sched_ctx].nready_tasks, max_workers_idle_time[i], norm_idle_time, elapsed_time, norm_exec_time, workers->nworkers, config->max_nworkers);
+		       sched_ctx, nready_tasks, max_workers_idle_time[i], norm_idle_time, elapsed_time, norm_exec_time, workers->nworkers, config->max_nworkers);
 
 
 		total_max_nworkers += config->max_nworkers;
@@ -913,9 +906,10 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 		unsigned max_nready_sched_ctx = sched_ctxs[0];
 		for(i = 0; i < nsched_ctxs; i++)
 		{
-			if(max_nready < hypervisor.sched_ctx_w[sched_ctxs[i]].nready_tasks)
+			int nready_tasks = starpu_get_nready_tasks_of_sched_ctx(sched_ctxs[i]);
+			if(max_nready < nready_tasks)
 			{
-				max_nready = hypervisor.sched_ctx_w[sched_ctxs[i]].nready_tasks;
+				max_nready = nready_tasks;
 				max_nready_sched_ctx = sched_ctxs[i];
 			}
 		}
@@ -964,7 +958,13 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 		
 		if(hypervisor.policy.handle_idle_cycle)
 		{
-			hypervisor.policy.handle_idle_cycle(sched_ctx, worker);
+			double curr_time = starpu_timing_now();
+			double elapsed_time = (curr_time - sc_w->hyp_react_start_time) / 1000000.0; /* in seconds */
+			if(sc_w->sched_ctx != STARPU_NMAX_SCHED_CTXS && elapsed_time > sc_w->config->time_sample)
+			{
+				sc_w->hyp_react_start_time = starpu_timing_now();
+				hypervisor.policy.handle_idle_cycle(sched_ctx, worker);
+			}
 		}
 	}
 	return;
@@ -986,7 +986,7 @@ static void notify_poped_task(unsigned sched_ctx, int worker)
 	if(sc_w->idle_start_time[worker] > 0.0)
 	{
 		double end_time  = starpu_timing_now();
-		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */ 
+		sc_w->idle_time[worker] += (end_time - sc_w->idle_start_time[worker]) / 1000000.0; /* in seconds */
 		sc_w->idle_start_time[worker] = 0.0;
 	}
 			
@@ -997,7 +997,7 @@ static void notify_poped_task(unsigned sched_ctx, int worker)
 
  
 /* notifies the hypervisor that a tagged task has just been executed */
-static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, int task_tag, int ready_tasks, double ready_flops)
+static void notify_post_exec_task(struct starpu_task *task, size_t data_size, uint32_t footprint, int task_tag, double flops)
 {
 	unsigned sched_ctx = task->sched_ctx;
 	int worker = starpu_worker_get_id();
@@ -1011,25 +1011,30 @@ static void notify_post_exec_task(struct starpu_task *task, size_t data_size, ui
 	}
 
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
-	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += task->flops;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += flops;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
-	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += task->flops;
+	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += flops;
 
 	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= task->flops;
-	hypervisor.sched_ctx_w[sched_ctx].nready_tasks = ready_tasks;
-	hypervisor.sched_ctx_w[sched_ctx].ready_flops = ready_flops;
-	if(hypervisor.sched_ctx_w[sched_ctx].ready_flops < 0.0)
-		hypervisor.sched_ctx_w[sched_ctx].ready_flops = 0.0;
-	_ack_resize_completed(sched_ctx, worker);
+	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= flops;
+	if(_sc_hypervisor_use_lazy_resize())
+		_ack_resize_completed(sched_ctx, worker);
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 
 	
 	if(hypervisor.resize[sched_ctx])
 	{	
 		if(hypervisor.policy.handle_poped_task)
-			hypervisor.policy.handle_poped_task(sched_ctx, worker, task, footprint);
+		{
+			double curr_time = starpu_timing_now();
+			double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].hyp_react_start_time) / 1000000.0; /* in seconds */
+			if(hypervisor.sched_ctx_w[sched_ctx].sched_ctx != STARPU_NMAX_SCHED_CTXS && elapsed_time > hypervisor.sched_ctx_w[sched_ctx].config->time_sample)
+			{
+				hypervisor.sched_ctx_w[sched_ctx].hyp_react_start_time = starpu_timing_now();
+				hypervisor.policy.handle_poped_task(sched_ctx, worker, task, footprint);
+			}
+		}
 	}
 /* 	starpu_pthread_mutex_lock(&act_hypervisor_mutex); */
 /* 	_ack_resize_completed(sched_ctx, worker); */
@@ -1042,9 +1047,7 @@ static void notify_post_exec_task(struct starpu_task *task, size_t data_size, ui
 	
 	unsigned conf_sched_ctx;
 	unsigned i;
-	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	unsigned ns = hypervisor.nsched_ctxs;
-	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 
 	for(i = 0; i < ns; i++)
 	{
@@ -1098,14 +1101,6 @@ static void notify_submitted_job(struct starpu_task *task, uint32_t footprint, s
 		hypervisor.policy.handle_submitted_job(task->cl, task->sched_ctx, footprint, data_size);
 }
 
-static void notify_ready_task(unsigned sched_ctx_id, struct starpu_task *task)
-{
-	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
-	hypervisor.sched_ctx_w[sched_ctx_id].nready_tasks++;
-	hypervisor.sched_ctx_w[sched_ctx_id].ready_flops += task->flops;
-	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
-}
-
 static void notify_empty_ctx(unsigned sched_ctx_id, struct starpu_task *task)
 {
 	sc_hypervisor_resize_ctxs(NULL, -1 , NULL, -1);
@@ -1126,10 +1121,10 @@ static void notify_delete_context(unsigned sched_ctx)
 
 void sc_hypervisor_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
-	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
+//	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	unsigned curr_nsched_ctxs = sched_ctxs == NULL ? hypervisor.nsched_ctxs : (unsigned)nsched_ctxs;
 	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? hypervisor.sched_ctxs : sched_ctxs;
-	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
+//	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 	unsigned s;
 	for(s = 0; s < curr_nsched_ctxs; s++)
 		hypervisor.resize[curr_sched_ctxs[s]] = 1;
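
A minimal sketch of the throttling pattern the two hunks above introduce in notify_idle_cycle() and notify_post_exec_task(): the hypervisor policy is only invoked when more than config->time_sample seconds have elapsed since the last reaction. The struct and function names below are illustrative, not StarPU API; only starpu_timing_now() (which returns microseconds) and the time_sample threshold come from the code above.

#include <starpu.h>

/* Hypothetical throttle: react() runs at most once per time_sample seconds. */
struct hyp_throttle
{
	double start_time;	/* time of the last reaction, in us (starpu_timing_now()) */
	double time_sample;	/* minimum delay between two reactions, in seconds */
};

static void maybe_react(struct hyp_throttle *t, void (*react)(void))
{
	double elapsed = (starpu_timing_now() - t->start_time) / 1000000.0; /* us -> s */
	if (elapsed > t->time_sample)
	{
		t->start_time = starpu_timing_now();
		react();
	}
}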

+ 22 - 2
src/common/barrier_counter.c

@@ -62,9 +62,9 @@ int _starpu_barrier_counter_decrement_until_empty_counter(struct _starpu_barrier
 	int ret = 0;
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
 
+	barrier->reached_flops -= flops;
 	if (--barrier->reached_start == 0)
 	{
-		barrier->reached_flops -= flops;
 		ret = 1;
 		STARPU_PTHREAD_COND_BROADCAST(&barrier->cond);
 	}
@@ -79,9 +79,9 @@ int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_
 	int ret = 0;
 	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
 
+	barrier->reached_flops += flops;
 	if(++barrier->reached_start == barrier->count)
 	{
-		barrier->reached_flops += flops;
 		ret = 1;
 		STARPU_PTHREAD_COND_BROADCAST(&barrier_c->cond2);
 	}
@@ -113,3 +113,23 @@ int _starpu_barrier_counter_check(struct _starpu_barrier_counter *barrier_c)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
 	return 0;
 }
+
+int _starpu_barrier_counter_get_reached_start(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	int ret;
+//	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+	ret = barrier->reached_start;
+//	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return ret;
+}
+
+double _starpu_barrier_counter_get_reached_flops(struct _starpu_barrier_counter *barrier_c)
+{
+	struct _starpu_barrier *barrier = &barrier_c->barrier;
+	double ret;
+//	STARPU_PTHREAD_MUTEX_LOCK(&barrier->mutex);
+	ret = barrier->reached_flops;
+//	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier->mutex);
+	return ret;
+}
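
With this change reached_flops is updated on every increment/decrement rather than only when the counter reaches its bound, and the two accessors added above read the fields without taking the barrier mutex (the locking is left commented out), so callers only get an approximate snapshot. A usage sketch, mirroring the sched_ctx.c hunk later in this commit (sched_ctx stands for a struct _starpu_sched_ctx *):

/* Approximate snapshot of a context's ready tasks and their flops;
 * no lock is taken, so the values may be slightly stale. */
int nready_tasks = _starpu_barrier_counter_get_reached_start(&sched_ctx->ready_tasks_barrier);
double nready_flops = _starpu_barrier_counter_get_reached_flops(&sched_ctx->ready_tasks_barrier);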

+ 3 - 0
src/common/barrier_counter.h

@@ -42,4 +42,7 @@ int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c,
 
 int _starpu_barrier_counter_check(struct _starpu_barrier_counter *barrier_c);
 
+int _starpu_barrier_counter_get_reached_start(struct _starpu_barrier_counter *barrier_c);
+
+double _starpu_barrier_counter_get_reached_flops(struct _starpu_barrier_counter *barrier_c);
 #endif

+ 15 - 0
src/common/fxt.h

@@ -149,6 +149,9 @@
 #define _STARPU_FUT_SCHED_NODE_PUSH_PRIO 	0x515a
 #define _STARPU_FUT_SCHED_NODE_POP_PRIO 	0x515b
 
+#define	_STARPU_FUT_HYPERVISOR_BEGIN    0x5160
+#define	_STARPU_FUT_HYPERVISOR_END	0x5161
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -456,6 +459,13 @@ do {										\
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL			\
 	FUT_DO_PROBE0(_STARPU_FUT_TASK_WAIT_FOR_ALL)
 
+#define _STARPU_TRACE_HYPERVISOR_BEGIN()  \
+	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
+
+#define _STARPU_TRACE_HYPERVISOR_END() \
+	do {} while (0)
+//	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_END, _starpu_gettid());
+
 #ifdef STARPU_FXT_LOCK_TRACES 
 
 #define _STARPU_TRACE_LOCKING_MUTEX()	do { \
@@ -687,8 +697,10 @@ do {										\
 #define _STARPU_TRACE_MEMORY_FULL(size)				do {} while(0)
 #define _STARPU_TRACE_START_UNPARTITION(handle, memnode)	do {} while(0)
 #define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		do {} while(0)
 #define _STARPU_TRACE_SCHED_NODE_PUSH_PRIO(workerid, ntasks, exp_len)	do {} while(0)
 #define _STARPU_TRACE_SCHED_NODE_POP_PRIO(workerid, ntasks, exp_len)	do {} while(0)
+#define _STARPU_TRACE_HYPERVISOR_BEGIN()        do {} while(0)
+#define _STARPU_TRACE_HYPERVISOR_END()                  do {} while(0)
 
 #endif // STARPU_USE_FXT
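
The BEGIN/END pair above is meant to bracket every call into a hypervisor performance counter, so that time spent in the hypervisor shows up as a dedicated worker state ("H", defined in the starpu_paje.c hunk at the end of this commit). Note that _STARPU_TRACE_HYPERVISOR_END() currently expands to a no-op, its probe being commented out. The intended use, as in the sched_ctx.c hunks below:

_STARPU_TRACE_HYPERVISOR_BEGIN();
sched_ctx->perf_counters->notify_pushed_task(sched_ctx_id, workerid);
_STARPU_TRACE_HYPERVISOR_END();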
 

+ 0 - 1
src/core/jobs.c

@@ -298,7 +298,6 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
 	_starpu_decrement_nsubmitted_tasks_of_sched_ctx(sched_ctx);
 	_starpu_decrement_nready_tasks_of_sched_ctx(sched_ctx, flops);
-
 	struct _starpu_worker *worker;
 	worker = _starpu_get_local_worker_key();
 	if (worker)

+ 69 - 15
src/core/sched_ctx.c

@@ -27,6 +27,10 @@ static starpu_pthread_mutex_t finished_submit_mutex = STARPU_PTHREAD_MUTEX_INITI
 struct starpu_task stop_submission_task = STARPU_TASK_INITIALIZER;
 starpu_pthread_key_t sched_ctx_key;
 unsigned with_hypervisor = 0;
+double hyp_start_sample[STARPU_NMAX_SCHED_CTXS];
+double hyp_start_allow_sample[STARPU_NMAX_SCHED_CTXS];
+double flops[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
+size_t data_size[STARPU_NMAX_SCHED_CTXS][STARPU_NMAXWORKERS];
 
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 
@@ -327,12 +331,6 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 		}
 	}
 
-	int w;
-	for(w = 0; w < STARPU_NMAXWORKERS; w++)
-	{
-		sched_ctx->pop_counter[w] = 0;
-	}
-
 	return sched_ctx;
 }
 
@@ -580,7 +578,11 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 #ifdef STARPU_USE_SC_HYPERVISOR
 	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
 	   && sched_ctx->perf_counters != NULL)
+	{
+		_STARPU_TRACE_HYPERVISOR_BEGIN();
 		sched_ctx->perf_counters->notify_delete_context(sched_ctx_id);
+		_STARPU_TRACE_HYPERVISOR_END();
+	}
 #endif //STARPU_USE_SC_HYPERVISOR
 
 	unsigned inheritor_sched_ctx_id = sched_ctx->inheritor;
@@ -914,16 +916,16 @@ void _starpu_decrement_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double r
 	_starpu_barrier_counter_decrement_until_empty_counter(&sched_ctx->ready_tasks_barrier, ready_flops);
 }
 
-int _starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id)
+int starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	return sched_ctx->ready_tasks_barrier.barrier.reached_start;
+	return _starpu_barrier_counter_get_reached_start(&sched_ctx->ready_tasks_barrier);
 }
 
-double _starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id)
+double starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	return sched_ctx->ready_tasks_barrier.barrier.reached_flops;
+	return _starpu_barrier_counter_get_reached_flops(&sched_ctx->ready_tasks_barrier);
 }
 
 int _starpu_wait_for_no_ready_of_sched_ctx(unsigned sched_ctx_id)
@@ -958,6 +960,17 @@ unsigned _starpu_sched_ctx_get_current_context()
 void starpu_sched_ctx_notify_hypervisor_exists()
 {
 	with_hypervisor = 1;
+	int i, j;
+	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
+	{
+		hyp_start_sample[i] = starpu_timing_now();
+		hyp_start_allow_sample[i] = 0.0;
+		for(j = 0; j < STARPU_NMAXWORKERS; j++)
+		{
+			flops[i][j] = 0.0;
+			data_size[i][j] = 0;
+		}
+	}
 }
 
 unsigned starpu_sched_ctx_check_if_hypervisor_exists()
@@ -965,6 +978,32 @@ unsigned starpu_sched_ctx_check_if_hypervisor_exists()
 	return with_hypervisor;
 }
 
+unsigned _starpu_sched_ctx_allow_hypervisor(unsigned sched_ctx_id)
+{
+	return 1;
+	double now = starpu_timing_now();
+	if(hyp_start_allow_sample[sched_ctx_id] > 0.0)
+	{
+		double allow_sample = (now - hyp_start_allow_sample[sched_ctx_id]) / 1000000.0;
+		if(allow_sample < 0.001)
+			return 1;
+		else
+		{
+			hyp_start_allow_sample[sched_ctx_id] = 0.0;
+			hyp_start_sample[sched_ctx_id] = starpu_timing_now();
+			return 0;
+		}
+	}
+	double forbid_sample = (now - hyp_start_sample[sched_ctx_id]) / 1000000.0;
+	if(forbid_sample > 0.01)
+	{
+//		hyp_start_sample[sched_ctx_id] = starpu_timing_now();
+		hyp_start_allow_sample[sched_ctx_id] = starpu_timing_now();
+		return 1;
+	}
+	return 0;
+}
+
 unsigned _starpu_get_nsched_ctxs()
 {
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
@@ -1198,14 +1237,25 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
 #ifdef STARPU_USE_SC_HYPERVISOR
 
-void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint)
+void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size2, uint32_t footprint)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	if(sched_ctx != NULL && task->sched_ctx != _starpu_get_initial_sched_ctx()->id && 
 	   task->sched_ctx != STARPU_NMAX_SCHED_CTXS  && sched_ctx->perf_counters != NULL)
-		sched_ctx->perf_counters->notify_post_exec_task(task, data_size, footprint, task->hypervisor_tag, 
-								_starpu_get_nready_tasks_of_sched_ctx(sched_ctx->id), 
-								_starpu_get_nready_flops_of_sched_ctx(sched_ctx->id));
+	{
+		flops[task->sched_ctx][workerid] += task->flops;
+		data_size[task->sched_ctx][workerid] += data_size2;
+
+		if(_starpu_sched_ctx_allow_hypervisor(sched_ctx->id) || task->hypervisor_tag > 0)
+		{
+			_STARPU_TRACE_HYPERVISOR_BEGIN();
+			sched_ctx->perf_counters->notify_post_exec_task(task, data_size[task->sched_ctx][workerid], footprint,
+									task->hypervisor_tag, flops[task->sched_ctx][workerid]);
+			_STARPU_TRACE_HYPERVISOR_END();
+			flops[task->sched_ctx][workerid] = 0.0;
+			data_size[task->sched_ctx][workerid] = 0;
+		}
+	}
 }
 
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
@@ -1213,8 +1263,12 @@ void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 
 	if(sched_ctx != NULL && sched_ctx_id != _starpu_get_initial_sched_ctx()->id && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
-	   && sched_ctx->perf_counters != NULL)
+	   && sched_ctx->perf_counters != NULL && _starpu_sched_ctx_allow_hypervisor(sched_ctx_id))
+	{
+		_STARPU_TRACE_HYPERVISOR_BEGIN();
 		sched_ctx->perf_counters->notify_pushed_task(sched_ctx_id, workerid);
+		_STARPU_TRACE_HYPERVISOR_END();
+	}
 }
 #endif //STARPU_USE_SC_HYPERVISOR
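
As committed, _starpu_sched_ctx_allow_hypervisor() returns 1 unconditionally, so the sampling code after the early return is dead: its intent is to alternate a short window in which hypervisor notifications pass through with a longer window in which they are suppressed. A compact sketch of that intended behaviour, assuming only starpu_timing_now(); the constant names are illustrative, their values are the ones hard-coded above.

#define HYP_ALLOW_WINDOW	0.001	/* seconds: notifications pass through for 1 ms... */
#define HYP_FORBID_WINDOW	0.01	/* seconds: ...then are dropped for 10 ms */

/* Returns 1 while inside an allow window, 0 otherwise. */
static unsigned allow_hypervisor(double *allow_start, double *forbid_start)
{
	double now = starpu_timing_now();
	if (*allow_start > 0.0)
	{
		if ((now - *allow_start) / 1000000.0 < HYP_ALLOW_WINDOW)
			return 1;
		*allow_start = 0.0;
		*forbid_start = now;	/* allow window over: start a forbid window */
		return 0;
	}
	if ((now - *forbid_start) / 1000000.0 > HYP_FORBID_WINDOW)
	{
		*allow_start = now;	/* forbid window over: open an allow window */
		return 1;
	}
	return 0;
}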
 

+ 2 - 6
src/core/sched_ctx.h

@@ -89,10 +89,6 @@ struct _starpu_sched_ctx
 	/* max GPUs to execute*/
 	int max_ngpus;
 
-	/* needed for overlapping contexts to help the workers
-	   determine which is the next context to pop tasks from */
-	unsigned pop_counter[STARPU_NMAXWORKERS];
-
 	/* in case we delete the context leave resources to the inheritor*/
 	unsigned inheritor;
 
@@ -152,8 +148,6 @@ int _starpu_check_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id);
 
 void _starpu_decrement_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops);
 void _starpu_increment_nready_tasks_of_sched_ctx(unsigned sched_ctx_id, double ready_flops);
-int _starpu_get_nready_tasks_of_sched_ctx(unsigned sched_ctx_id);
-double _starpu_get_nready_flops_of_sched_ctx(unsigned sched_ctx_id);
 int _starpu_wait_for_no_ready_of_sched_ctx(unsigned sched_ctx_id);
 
 /* Return the corresponding index of the workerid in the ctx table */
@@ -198,6 +192,8 @@ int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _star
 
 void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx);
 
+unsigned _starpu_sched_ctx_allow_hypervisor(unsigned sched_ctx_id);
+
 #ifdef STARPU_USE_SC_HYPERVISOR
 /* Notifies the hypervisor that a tasks was poped from the workers' list */
 void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task, size_t data_size, uint32_t footprint);

+ 39 - 19
src/core/sched_policy.c

@@ -338,7 +338,11 @@ int _starpu_push_task(struct _starpu_job *j)
 #ifdef STARPU_USE_SC_HYPERVISOR
 			if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->perf_counters != NULL 
 			   && sched_ctx->perf_counters->notify_empty_ctx)
+			{
+				_STARPU_TRACE_HYPERVISOR_BEGIN();
 				sched_ctx->perf_counters->notify_empty_ctx(sched_ctx->id, task);
+				_STARPU_TRACE_HYPERVISOR_END();
+			}
 #endif
 			return 0;
 		}
@@ -382,7 +386,11 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 #ifdef STARPU_USE_SC_HYPERVISOR
 			if(sched_ctx != NULL && sched_ctx->id != 0 && sched_ctx->perf_counters != NULL 
 			   && sched_ctx->perf_counters->notify_empty_ctx)
+			{
+				_STARPU_TRACE_HYPERVISOR_BEGIN();
 				sched_ctx->perf_counters->notify_empty_ctx(sched_ctx->id, task);
+				_STARPU_TRACE_HYPERVISOR_END();
+			}
 #endif
 
 			return -EAGAIN;
@@ -568,29 +576,34 @@ struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker
 	struct _starpu_sched_ctx *sched_ctx, *good_sched_ctx = NULL;
 	unsigned smallest_counter =  worker->nsched_ctxs;
 	struct _starpu_sched_ctx_list *l = NULL;
-	for (l = worker->sched_ctx_list; l; l = l->next)
+	if(!worker->reverse_phase)
 	{
-		sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
-/* 		if(worker->removed_from_ctx[sched_ctx->id] == 1 && worker->shares_tasks_lists[sched_ctx->id] == 1) */
-/* 			return sched_ctx; */
-		if(sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
-		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
+		/* find a context from which the worker has not popped yet in this phase */
+		for (l = worker->sched_ctx_list; l; l = l->next)
 		{
-			good_sched_ctx = sched_ctx;
-			smallest_counter = sched_ctx->pop_counter[worker->workerid];
+			if(!worker->poped_in_ctx[l->sched_ctx])
+			{
+				worker->poped_in_ctx[l->sched_ctx] = !worker->poped_in_ctx[l->sched_ctx];
+				return	_starpu_get_sched_ctx_struct(l->sched_ctx);
+			}
 		}
+		worker->reverse_phase = !worker->reverse_phase;
 	}
-	
-	if(good_sched_ctx == NULL)
+	if(worker->reverse_phase)
 	{
+		/* the worker has already popped from every context: start over from the beginning of the list */
 		for (l = worker->sched_ctx_list; l; l = l->next)
 		{
-			sched_ctx = _starpu_get_sched_ctx_struct(l->sched_ctx);
-			sched_ctx->pop_counter[worker->workerid] = 0;
+			if(worker->poped_in_ctx[l->sched_ctx])
+			{
+				worker->poped_in_ctx[l->sched_ctx] = !worker->poped_in_ctx[l->sched_ctx];
+				return	_starpu_get_sched_ctx_struct(l->sched_ctx);
+			}
 		}
-		return _starpu_get_sched_ctx_struct(worker->sched_ctx_list->sched_ctx);
-	}
-	return good_sched_ctx;
+		worker->reverse_phase = !worker->reverse_phase;
+	}	
+	worker->poped_in_ctx[worker->sched_ctx_list->sched_ctx] = !worker->poped_in_ctx[worker->sched_ctx_list->sched_ctx];
+	return _starpu_get_sched_ctx_struct(worker->sched_ctx_list->sched_ctx);
 }
 
 struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker)
@@ -665,17 +678,20 @@ pick:
 				}
 #ifdef STARPU_USE_SC_HYPERVISOR
 				struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
-				if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_cycle)
+				if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_idle_cycle && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id))
+				{
+//					_STARPU_TRACE_HYPERVISOR_BEGIN();
 					perf_counters->notify_idle_cycle(sched_ctx->id, worker->workerid, 1.0);
+//					_STARPU_TRACE_HYPERVISOR_END();
+				}
 #endif //STARPU_USE_SC_HYPERVISOR
 				
 #ifndef STARPU_NON_BLOCKING_DRIVERS
-				if((sched_ctx->pop_counter[worker->workerid] == 0 && been_here[sched_ctx->id]) || worker->nsched_ctxs == 1)
+				if(been_here[sched_ctx->id] || worker->nsched_ctxs == 1)
 					break;
 				been_here[sched_ctx->id] = 1;
 #endif
 			}
-			sched_ctx->pop_counter[worker->workerid]++;
 		}
 	  }
 
@@ -689,8 +705,12 @@ pick:
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
 	struct starpu_sched_ctx_performance_counters *perf_counters = sched_ctx->perf_counters;
 
-	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_poped_task)
+	if(sched_ctx->id != 0 && perf_counters != NULL && perf_counters->notify_poped_task && _starpu_sched_ctx_allow_hypervisor(sched_ctx->id))
+	{
+//		_STARPU_TRACE_HYPERVISOR_BEGIN();
 		perf_counters->notify_poped_task(task->sched_ctx, worker->workerid);
+//		_STARPU_TRACE_HYPERVISOR_END();
+	}
 #endif //STARPU_USE_SC_HYPERVISOR
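
In _get_next_sched_ctx_to_pop_into() above, the per-worker pop_counter round robin is replaced by two flags: poped_in_ctx[ctx] records whether the worker has already popped from that context in the current phase, and reverse_phase is flipped once every context has been visited, which avoids resetting the whole array. A minimal sketch of the selection logic in isolation (hypothetical helper, not StarPU API; the array is indexed by position in the worker's context list):

/* A context is eligible when its flag equals the current phase; popping from it
 * toggles the flag, and when no context is eligible the phase itself is flipped
 * instead of clearing the array. */
static int pick_next_ctx(unsigned flag[], unsigned nctx, unsigned *phase)
{
	unsigned pass, i;
	for (pass = 0; pass < 2; pass++)
	{
		for (i = 0; i < nctx; i++)
		{
			if (flag[i] == *phase)
			{
				flag[i] = !flag[i];
				return (int) i;
			}
		}
		*phase = !*phase;	/* every context visited: start a new round */
	}
	return -1;	/* only reached when nctx == 0 */
}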
 
 

+ 5 - 3
src/core/task.c

@@ -26,6 +26,7 @@
 #include <core/task_bundle.h>
 #include <common/config.h>
 #include <common/utils.h>
+#include <common/fxt.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <math.h>
@@ -259,7 +260,9 @@ int _starpu_submit_job(struct _starpu_job *j)
 				data_size += _starpu_data_get_size(handle);
 		}
 
+		_STARPU_TRACE_HYPERVISOR_BEGIN();
 		sched_ctx->perf_counters->notify_submitted_job(j->task, j->footprint, data_size);
+		_STARPU_TRACE_HYPERVISOR_END();
 	}
 #endif//STARPU_USE_SC_HYPERVISOR
 
@@ -659,7 +662,6 @@ int _starpu_task_submit_conversion_task(struct starpu_task *task,
 	STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
 	j->submitted = 1;
 	_starpu_increment_nready_tasks_of_sched_ctx(j->task->sched_ctx, j->task->flops);
-
 	for (i=0 ; i<task->cl->nbuffers ; i++)
 	{
 		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(j->task, i);
@@ -852,7 +854,7 @@ int starpu_task_nready(void)
 	int nready = 0;
 	struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config();
 	if(config->topology.nsched_ctxs == 1)
-		nready = _starpu_get_nready_tasks_of_sched_ctx(0);
+		nready = starpu_get_nready_tasks_of_sched_ctx(0);
 	else
 	{
 		int s;
@@ -860,7 +862,7 @@ int starpu_task_nready(void)
 		{
 			if(config->sched_ctxs[s].id != STARPU_NMAX_SCHED_CTXS)
 			{
-				nready += _starpu_get_nready_tasks_of_sched_ctx(config->sched_ctxs[s].id);
+				nready += starpu_get_nready_tasks_of_sched_ctx(config->sched_ctxs[s].id);
 			}
 		}
 	}

+ 4 - 0
src/core/workers.c

@@ -440,7 +440,11 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	workerarg->parallel_sect = 0;
 
 	for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
+	{
 		workerarg->shares_tasks_lists[ctx] = 0;
+		workerarg->poped_in_ctx[ctx] = 0;
+	}
+	workerarg->reverse_phase = 0;
 
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 }

+ 8 - 0
src/core/workers.h

@@ -106,6 +106,14 @@ LIST_TYPE(_starpu_worker,
 	/* in this case when removing him from a context it disapears instantly */
 	unsigned shares_tasks_lists[STARPU_NMAX_SCHED_CTXS];
 
+	/* per-context flag used to choose the next context the worker will pop from */
+	unsigned poped_in_ctx[STARPU_NMAX_SCHED_CTXS];
+
+	/* flipped once every context has been visited; inverts the meaning of poped_in_ctx so the round robin can start over */
+	unsigned reverse_phase;
+
+
+
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;
 #endif /* __GLIBC__ */

+ 0 - 1
src/datawizard/interfaces/data_interface.c

@@ -685,7 +685,6 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 	/* Wait for all requests to finish (notably WT requests) */
 	STARPU_PTHREAD_MUTEX_LOCK(&handle->busy_mutex);
 	while (1) {
-		int busy;
 		/* Here helgrind would shout that this an unprotected access,
 		 * but this is actually fine: all threads who do busy_count--
 		 * are supposed to call _starpu_data_check_not_busy, which will

+ 30 - 0
src/debug/traces/starpu_fxt.c

@@ -685,6 +685,28 @@ static void handle_end_callback(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "B");
 }
 
+static void handle_hyp_begin(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	int worker;
+	worker = find_worker_id(ev->param[0]);
+	if (worker < 0)
+		return;
+
+	if (out_paje_file)
+		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "H");
+}
+
+static void handle_hyp_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+/* 	int worker; */
+/* 	worker = find_worker_id(ev->param[0]); */
+/* 	if (worker < 0) */
+/* 		return; */
+
+/* 	if (out_paje_file) */
+/* 		worker_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B"); */
+}
+
 static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
 {
 	int worker;
@@ -1679,6 +1701,14 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 			case _STARPU_FUT_SCHED_NODE_PUSH_PRIO:
 				break;
 
+			case _STARPU_FUT_HYPERVISOR_BEGIN:
+				handle_hyp_begin(&ev, options);
+				break;
+
+			case _STARPU_FUT_HYPERVISOR_END:
+				handle_hyp_end(&ev, options);
+				break;
+
 			default:
 #ifdef STARPU_VERBOSE
 				fprintf(stderr, "unknown event.. %x at time %llx WITH OFFSET %llx\n",

+ 3 - 1
src/debug/traces/starpu_paje.c

@@ -194,6 +194,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
 		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
+		poti_DefineEntityValue("H", ctx, "Hypervisor", ".5 .18 .0");
 	}
 
 	/* Types for the Scheduler */
@@ -233,7 +234,8 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       B       S       Overhead         \".5 .18 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       P       S       Progressing         \".4 .1 .6\"		\n\
-6       U       S       Unpartitioning      \".0 .0 1.0\"		\n");
+6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
+6       H       S       Hypervisor      \".5 .18 .0\"		\n");
 	fprintf(file, "\
 6       P       CtS       Processing         \"0 0 0\"		\n\
 6       Sl       CtS      Sleeping         \".9 .1 .0\"		\n\