
Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Samuel Thibault 6 years ago
commit 4c9baf958b

+ 1 - 0
AUTHORS

@@ -21,6 +21,7 @@ Khorsi Yanis, Inria, <yanis.khorsi@inria.fr>
 Lambert Thibaut, Inria, <thibaud.lambert@inria.fr>
 Leria Erwan, University of Bordeaux, <erwan.leria@etu.u-bordeaux.fr>
 Lizé Benoît, Airbus, <benoit.lize@gmail.com>
+Makni Mariem, Inria, <mariem.makni@inria.fr>
 Nakov Stojce, Inria, <stojce.nakov@inria.fr>
 Namyst Raymond, University of Bordeaux, <raymond.namyst@labri.fr>
 Nesi Lucas Leandro, Federal University of Rio Grande do Sul (UFRGS), <llnesi@inf.ufrgs.br>

+ 4 - 2
doc/doxygen/chapters/101_building.doxy

@@ -504,8 +504,10 @@ multiplication using BLAS and cuBLAS. They output the obtained GFlops.
 It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
 using the simgrid version of StarPU: first install the simgrid simulator from
-http://simgrid.gforge.inria.fr/ (we tested with simgrid 3.11, 3.12 and 3.13, other versions
-may have compatibility issues), then configure StarPU with \ref enable-simgrid
+http://simgrid.gforge.inria.fr/ (we tested with simgrid 3.11 to 3.16 and
+3.18 to 3.22; other versions may have compatibility issues, 3.17 notably does
+not build at all, and MPI simulation does not work with version 3.22),
+then configure StarPU with \ref enable-simgrid
 "--enable-simgrid" and rebuild and install it, and then you can simulate the performance for a
 few virtualized systems shipped along StarPU: attila, mirage, idgraf, and sirocco.
 
 

+ 26 - 12
doc/doxygen/chapters/320_scheduling.doxy

@@ -166,20 +166,34 @@ be obtained from the machine power supplier.
 The energy actually consumed by the total execution can be displayed by setting
 <c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
 
-On-line task consumption measurement is currently only supported through the
+For OpenCL devices, on-line task consumption measurement is currently supported through the
 <c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
-simulator. Applications can however provide explicit measurements by
-using the function starpu_perfmodel_update_history() (examplified in \ref PerformanceModelExample
-with the <c>energy_model</c> performance model). Fine-grain
-measurement is often not feasible with the feedback provided by the hardware, so
-the user can for instance run a given task a thousand times, measure the global
+simulator.
+
+For CUDA devices, on-line task consumption measurement is supported on V100
+cards and beyond. This however only works for quite long tasks, since the
+measurement granularity is about 10ms.
+
+Applications can however provide explicit measurements by using the function
+starpu_perfmodel_update_history() (exemplified in \ref PerformanceModelExample
+with the <c>energy_model</c> performance model). Fine-grain measurement
+is often not feasible with the feedback provided by the hardware, so the
+user can for instance run a given task a thousand times, measure the global
 consumption for that series of tasks, divide it by a thousand, repeat for
-varying kinds of tasks and task sizes, and eventually feed StarPU
-with these manual measurements through starpu_perfmodel_update_history().
-For instance, for CUDA devices, <c>nvidia-smi -q -d POWER</c> can be used to get
-the current consumption in Watt. Multiplying this value by the average duration
-of a single task gives the consumption of the task in Joules, which can be given
-to starpu_perfmodel_update_history().
+varying kinds of tasks and task sizes, and eventually feed StarPU with these
+manual measurements through starpu_perfmodel_update_history().  For instance,
+for CUDA devices, <c>nvidia-smi -q -d POWER</c> can be used to get the current
+consumption in Watt. Multiplying this value by the average duration of a
+single task gives the consumption of the task in Joules, which can be given to
+starpu_perfmodel_update_history().
+
+Another way to provide the energy performance is to define a
+perfmodel with starpu_perfmodel::type ::STARPU_PER_ARCH, and set the
+starpu_perfmodel::arch_cost_function field to a function which shall return the
+estimated consumption of the task in Joules. Such a function can for instance
+use starpu_task_expected_length() on the task (in µs), multiplied by the
+typical power consumption of the device, e.g. in W, and divided by 1000000 to
+get Joules.
 
 
 \section ExistingModularizedSchedulers Modularized Schedulers
 
 
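A minimal sketch of the manual-measurement path described in the hunk above, assuming a hypothetical helper and model symbol; only starpu_perfmodel_update_history() and the ::STARPU_HISTORY_BASED model type come from the documentation, and CPU 0 / implementation 0 are arbitrary choices for illustration:

\code{.c}
#include <starpu.h>

/* History-based energy model fed from manual measurements
 * (the symbol is illustrative). */
static struct starpu_perfmodel measured_energy_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "my_codelet_energy",
};

/* Hypothetical helper: record an energy value (in Joules) for a given task
 * and architecture, e.g. the metered consumption of a thousand identical
 * tasks divided by a thousand. cpuid 0 and implementation 0 are assumed. */
static void feed_energy_measurement(struct starpu_task *task,
				    struct starpu_perfmodel_arch *arch,
				    double joules)
{
	starpu_perfmodel_update_history(&measured_energy_model, task, arch, 0, 0, joules);
}
\endcode

The Joules figure can for instance be derived from the <c>nvidia-smi -q -d POWER</c> reading multiplied by the average task duration, as the paragraph above describes.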
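And a similar sketch of the analytic approach, assuming an illustrative 200 W device power draw and model symbol; starpu_perfmodel::arch_cost_function, ::STARPU_PER_ARCH and starpu_task_expected_length() are the interfaces named in the documentation:

\code{.c}
#include <starpu.h>

/* Assumed typical power draw of the target device, in W (illustrative). */
#define ASSUMED_DEVICE_WATTS 200.0

/* Estimated energy in Joules: expected length (µs) * power (W) / 1000000. */
static double energy_cost_function(struct starpu_task *task,
				   struct starpu_perfmodel_arch *arch,
				   unsigned nimpl)
{
	double length_us = starpu_task_expected_length(task, arch, nimpl);
	return length_us * ASSUMED_DEVICE_WATTS / 1000000.;
}

static struct starpu_perfmodel energy_model =
{
	.type = STARPU_PER_ARCH,
	.symbol = "my_codelet_energy_model",
	.arch_cost_function = energy_cost_function,
};
\endcode

Such a model would then be attached to a codelet through its energy_model field, next to the usual time performance model.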

+ 4 - 0
doc/doxygen/chapters/470_simgrid.doxy

@@ -16,6 +16,10 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+/*
+ * NOTE: XXX: also update simgrid versions in 101_building.doxy !!
+ */
+
 /*! \page SimGridSupport SimGrid Support

 StarPU can use Simgrid in order to simulate execution on an arbitrary

+ 5 - 5
src/core/perfmodel/perfmodel_history.c

@@ -742,7 +742,7 @@ static void check_per_arch_model(struct starpu_perfmodel *model, int comb, unsig
 	struct starpu_perfmodel_history_list *ptr = NULL;
 	unsigned nentries = 0;
 
-	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED  || model->type == STARPU_REGRESSION_BASED)
 	{
 		/* Dump the list of all entries in the history */
 		ptr = per_arch_model->list;
@@ -760,7 +760,7 @@ static void check_per_arch_model(struct starpu_perfmodel *model, int comb, unsig
 	check_reg_model(model, comb, impl);

 	/* Dump the history into the model file in case it is necessary */
-	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED || model->type == STARPU_REGRESSION_BASED)
 	{
 		ptr = per_arch_model->list;
 		while (ptr)
@@ -779,7 +779,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, in
 	struct starpu_perfmodel_history_list *ptr = NULL;
 	unsigned nentries = 0;
 
-	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+       if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED || model->type == STARPU_REGRESSION_BASED)
 	{
 		/* Dump the list of all entries in the history */
 		ptr = per_arch_model->list;
@@ -800,7 +800,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, in
 	dump_reg_model(f, model, comb, impl);

 	/* Dump the history into the model file in case it is necessary */
-	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+       if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED || model->type == STARPU_REGRESSION_BASED)
 	{
 		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean (us)\tdev (us)\tsum\t\tsum2\t\tn\n");
 		ptr = per_arch_model->list;
@@ -1861,7 +1861,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			model->state->per_arch_is_set[comb][impl] = 1;
 		}
 
-		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED || model->type == STARPU_REGRESSION_BASED)
 		{
 			struct starpu_perfmodel_history_entry *entry;
 			struct starpu_perfmodel_history_table *elt;