Browse Source

Add more energy support

  * Add energy accounting in the simgrid mode: starpu_energy_use() and
    starpu_energy_used().

  * Add a synthetic energy efficiency testcase.
Samuel Thibault 5 years ago
parent
commit
4e71fdd318

+ 3 - 0
ChangeLog

@@ -31,9 +31,12 @@ New features:
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
   * New STARPU_PER_WORKER perfmodel.
+  * Add energy accounting in the simgrid mode: starpu_energy_use() and
+    starpu_energy_used().
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
+  * Add a synthetic energy efficiency testcase.
 
 StarPU 1.3.4 (git revision xxx)
 ==============================================

+ 3 - 0
configure.ac

@@ -3519,6 +3519,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  mkdir -p tests/energy
+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   mkdir -p tests/overlap

+ 14 - 0
include/starpu_stdlib.h

@@ -251,6 +251,20 @@ void starpu_sleep(float nb_sec);
   */
 void starpu_usleep(float nb_micro_sec);
 
+/**
+   Account for \p joules J being used.
+   This is supported in simgrid mode, to record how much energy was used, and will
+   show up in further calls to starpu_energy_used().
+  */
+void starpu_energy_use(float joules);
+
+/**
+   Return the amount of energy having been used in J.
+   This accounts for the amounts passed to starpu_energy_use(), as well as the
+   static energy use set by the \ref STARPU_IDLE_POWER environment variable.
+  */
+double starpu_energy_used(void);
+
 /** @} */
 
 #ifdef __cplusplus

+ 1 - 1
include/starpu_task.h

@@ -513,7 +513,7 @@ struct starpu_codelet
 
 	/**
 	   Optional pointer to the task energy consumption performance
-	   model associated to this codelet. This optional field is
+	   model associated to this codelet (in J). This optional field is
 	   ignored when set to <c>NULL</c> or when its field
 	   starpu_perfmodel::symbol is not set. In the case of
 	   parallel codelets, this has to account for all processing

+ 24 - 2
src/core/simgrid.c

@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 extern void smpi_process_set_user_data(void *);
 #endif
 
+static double _starpu_simgrid_dynamic_energy = 0.0;
+
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
  * initialized through MSG_process_attach */
 static int simgrid_started;
@@ -629,6 +631,7 @@ struct task
 #else
 	msg_task_t task;
 #endif
+	double energy;
 
 	/* communication termination signalization */
 	unsigned *finished;
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 		MSG_task_execute(task->task);
 		MSG_task_destroy(task->task);
 #endif
+		starpu_energy_use(task->energy);
 		_STARPU_DEBUG("task %p finished\n", task);
 
 		*task->finished = 1;
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 }
 
 /* Task execution submitted by StarPU */
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
 {
 	struct starpu_task *starpu_task = j->task;
 	double flops;
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
 	if (isnan(length))
 	{
-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
                 /* TODO: option to add variance according to performance model,
                  * to be able to easily check scheduling robustness */
 	}
+	if (isnan(energy))
+	{
+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
+		/* TODO: option to add variance according to performance model,
+		 * to be able to easily check scheduling robustness */
+	}
 
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		MSG_task_execute(simgrid_task);
 		MSG_task_destroy(simgrid_task);
 #endif
+		starpu_energy_use(energy);
 	}
 	else
 	{
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 #else
 		task->task = simgrid_task;
 #endif
+		task->energy = energy;
 		task->finished = finished;
 		*finished = 0;
 		task->next = NULL;
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 }
 #endif
 
+/* Add the given amount of energy (in J) to the running total of dynamic
+ * energy, which starpu_energy_used() reports. */
+void starpu_energy_use(float joules)
+{
+	_starpu_simgrid_dynamic_energy = _starpu_simgrid_dynamic_energy + joules;
+}
+
+/* Return the energy used so far, in J: the dynamic part accumulated through
+ * starpu_energy_use(), plus the idle part, i.e. the value of the
+ * STARPU_IDLE_POWER environment variable (read with a default of 0) times the
+ * current value of starpu_timing_now() scaled down by 1000000. */
+double starpu_energy_used(void)
+{
+	const float idle_watts = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
+	const double idle_joules = idle_watts * starpu_timing_now() / 1000000;
+
+	return _starpu_simgrid_dynamic_energy + idle_joules;
+}
 
 #endif

+ 1 - 1
src/core/simgrid.h

@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 void _starpu_simgrid_actor_setup(void);
 void _starpu_simgrid_wait_tasks(int workerid);
 struct _starpu_job;
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
 struct _starpu_data_request;
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 union _starpu_async_channel_event;

+ 4 - 1
src/drivers/cpu/driver_cpu.c

@@ -108,7 +108,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 				_SIMGRID_TIMER_END;
 			}
 			else
-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
+			{
+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
+			}
 #else
 #  ifdef STARPU_PAPI
 			_starpu_profiling_papi_task_start_counters(task);

+ 4 - 1
src/drivers/cuda/driver_cuda.c

@@ -531,8 +531,11 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 				_SIMGRID_TIMER_END;
 			}
 		else
-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
+		{
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
+		}
 #else
 #ifdef HAVE_LIBNVIDIA_ML
 		unsigned long long energy_start = 0;

+ 6 - 1
src/drivers/opencl/driver_opencl.c

@@ -948,6 +948,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
 		double length = NAN;
+		double energy = NAN;
 		int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
 		int simulate = 1;
 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
@@ -976,6 +977,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 #else
 			length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
 #endif
+			energy = info->energy_consumed;
 			/* And give the simulated time to simgrid */
 			simulate = 1;
 #endif
@@ -989,8 +991,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 			}
 
 		if (simulate)
-			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
+		{
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+			/* _starpu_simgrid_submit_job() takes the worker id first,
+			 * then the scheduling context id */
+			_starpu_simgrid_submit_job(worker->workerid, sched_ctx->id, j, &worker->perf_arch, length, energy,
 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
+		}
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 

+ 1 - 2
src/sched_policies/component_best_implementation.c

@@ -41,12 +41,11 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 	}
 	else
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
 		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
 		{
 			if(starpu_worker_can_execute_task(workerid, task, impl))
 			{
-				double d = starpu_task_expected_length(task, archtype, impl);
+				double d = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, impl);
 				if(isnan(d))
 				{
 					best_impl = impl;

+ 4 - 0
tests/Makefile.am

@@ -42,6 +42,9 @@ EXTRA_DIST =					\
 	microbenchs/parallel_independent_heterogeneous_tasks.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks_data.sh	\
 	microbenchs/parallel_independent_homogeneous_tasks.sh	\
+	energy/static.sh			\
+	energy/dynamic.sh			\
+	energy/perfs.gp				\
 	datawizard/scratch_opencl_kernel.cl     \
 	datawizard/sync_and_notify_data_opencl_codelet.cl\
 	datawizard/opencl_codelet_unsigned_inc_kernel.cl \
@@ -212,6 +215,7 @@ myPROGRAMS +=					\
 
 if STARPU_SIMGRID
 myPROGRAMS +=					\
+	energy/energy_efficiency		\
 	datawizard/simgrid-locality
 endif
 

+ 54 - 0
tests/energy/dynamic.sh

@@ -0,0 +1,54 @@
+#!/bin/sh
+
+# Run energy_efficiency for a range of slow frequencies and for several
+# STARPU_SCHED_GAMMA values, then plot energy against performance and against
+# time with gnuplot.  This script does not set STARPU_NCPU_SLOW.
+
+# To have 24 cores
+export STARPU_HOSTNAME=sirocco
+
+# To avoid slowing down simulation
+export MALLOC_PERTURB_=0
+
+# You can play with these
+export N=40
+export NITER=30
+
+# Values given to STARPU_SCHED_GAMMA, one data file and one curve per value
+GAMMAS="1000000 100000 76000 10000 0"
+
+# For each gamma, launch one run per slow frequency (1200 to 3500 by steps of
+# 200) in the background.  Only the result line is kept: it is the one that
+# starts with the matrix size ($N * 512, 512 being TILE_SIZE in
+# energy_efficiency.c) followed by a tab.  Lines are sorted numerically on
+# their second field before being written to dynamic.$gamma.dat.
+for gamma in $GAMMAS; do
+	(for freq_slow in $(seq 1200 200 3500) ; do 
+		STARPU_SCHED_GAMMA=$gamma STARPU_FREQ_SLOW=$freq_slow \
+			./energy_efficiency $N $NITER | grep "^$(($N * 512))	" &
+	done) | sort -n -k 2 > dynamic.$gamma.dat
+done
+
+# Generate the gnuplot script: first graph, energy according to performance
+cat > dynamic.gp << EOF
+set output "dynamic.eps"
+set term postscript eps enhanced color font ",20"
+set key bottom right
+set xlabel "performance (GFlop/s)"
+set ylabel "energy (J)"
+
+plot \\
+EOF
+# One curve per gamma: x is field 5, y is field 7, error bars from fields 6 and 8
+for gamma in $GAMMAS; do
+	cat >> dynamic.gp << EOF
+	"dynamic.$gamma.dat" using 5:7:6:8 with xyerrorlines lw 2 title "$gamma", \\
+EOF
+done
+
+# Second graph, energy according to time
+cat >> dynamic.gp << EOF
+
+set output "dynamic-time.eps"
+set xlabel "time (ms)"
+set ylabel "energy (J)"
+
+plot \\
+EOF
+# One curve per gamma: x is field 3, y is field 7, error bars from fields 4 and 8
+for gamma in $GAMMAS; do
+	cat >> dynamic.gp << EOF
+	"dynamic.$gamma.dat" using 3:7:4:8 with xyerrorlines lw 2 title "$gamma", \\
+EOF
+done
+
+
+# Render both graphs and open them
+gnuplot dynamic.gp
+gv dynamic.eps &
+gv dynamic-time.eps &

+ 496 - 0
tests/energy/energy_efficiency.c

@@ -0,0 +1,496 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2016       Bérangère Subervie
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdbool.h>
+#include <starpu.h>
+#include "../helper.h"
+
+/*
+ * This tries to run kernels with different efficiency depending on the core
+ * frequency.
+ *
+ * This is based on the Cholesky factorization, which is made to exhibit three
+ * caricatural cases as follows:
+ *
+ * - gemm: always get faster with higher frequency
+ * - trsm: gets faster with higher frequency, but efficiency gets lower and
+ * lower
+ * - potrf: reaches a maximum performance, after which there is no point in
+ * running it at higher frequency.
+ *
+ * We here assume that the power use is the same for the different kernels
+ * (which wouldn't be true for real kernels, measurements would be needed, to
+ * feed the performance models).
+ */
+
+
+/* These are the different frequency and power parameters, as measured and
+ * provided to this program */
+static float freq_min, freq_fast;
+static float power_min, power_fast;
+
+/*
+ * This returns the dynamic power used by a CPU core in W at a given frequency
+ * in MHz
+ * This assumes C.V^2.F with V being proportional to F, thus C.F^3
+
+   freq_min = 1200
+   freq_fast = 3500
+   power_min = 2
+   power_fast = 8.2
+
+   freq_min3 = freq_min * freq_min * freq_min
+   freq_fast3 = freq_fast * freq_fast * freq_fast
+   alpha = (power_fast - power_min) / (freq_fast3 - freq_min3)
+   power(frequency) = power_min + alpha * (frequency*frequency*frequency - freq_min3)
+   plot [frequency=freq_min:freq_fast] power(frequency) lw 2
+
+ */
+static float power(float frequency)
+{
+	/* The cubes are computed in single precision, then widened */
+	const double cube_min = freq_min * freq_min * freq_min;
+	const double cube_fast = freq_fast * freq_fast * freq_fast;
+	const double cube_cur = frequency * frequency * frequency;
+
+	/* Slope of the cubic law going through (freq_min, power_min) and
+	 * (freq_fast, power_fast) */
+	const double slope = (power_fast - power_min) / (cube_fast - cube_min);
+
+	return power_min + slope * (cube_cur - cube_min);
+}
+
+
+/*
+ * This returns the frequency of the given worker and implementation in MHz.
+ * This is where we can tune either a given number of cores at a low frequency,
+ * or which implementation uses which frequency. */
+
+/* These are the chosen parameters: how many cores get slowed down, at which
+ * frequency */
+static int ncpu_slow = -1;
+static float freq_slow;
+
+static float frequency(int worker, unsigned i)
+{
+	/* Static version: the first ncpu_slow workers are assumed to run at
+	 * the slow speed, whatever the implementation */
+	if (ncpu_slow != -1)
+		return worker < ncpu_slow ? freq_slow : freq_fast;
+
+	/* Dynamic version: the runtime switches speed between tasks by
+	 * choosing between two implementations with different time and
+	 * energy.  Implementation 0 is the slow one, any other one is fast. */
+	return i == 0 ? freq_slow : freq_fast;
+}
+
+
+/* This is from magma
+
+  -- Innovative Computing Laboratory
+  -- Electrical Engineering and Computer Science Department
+  -- University of Tennessee
+  -- (C) Copyright 2009
+
+  Redistribution  and  use  in  source and binary forms, with or without
+  modification,  are  permitted  provided  that the following conditions
+  are met:
+
+  * Redistributions  of  source  code  must  retain  the above copyright
+    notice,  this  list  of  conditions  and  the  following  disclaimer.
+  * Redistributions  in  binary  form must reproduce the above copyright
+    notice,  this list of conditions and the following disclaimer in the
+    documentation  and/or other materials provided with the distribution.
+  * Neither  the  name of the University of Tennessee, Knoxville nor the
+    names of its contributors may be used to endorse or promote products
+    derived from this software without specific prior written permission.
+
+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  */
+
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
+
+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
+
+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
+
+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
+
+/* TRSM operation counts are expressed with the TRMM formulas */
+#define FMULS_TRSM FMULS_TRMM
+/* NOTE(review): FADDS_TRSM aliases FMULS_TRMM rather than FADDS_TRMM, so
+ * FLOPS_STRSM counts the multiplications twice.  This looks like a typo, but
+ * the trsm constants in perfs.gp were presumably produced with this
+ * definition -- confirm and regenerate them if this gets changed. */
+#define FADDS_TRSM FMULS_TRMM
+
+/* Flop count used for TRSM: multiplications plus additions */
+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
+
+
+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
+
+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
+
+
+
+/* Tags for spotting tasks in the trace */
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+
+/* Arbitrary tile size */
+#define	TILE_SIZE	512
+
+
+/*
+ * Kernel time performance models, would normally be provided by measurements
+ */
+
+/* We assume that GEMM scales perfectly with frequency */
+#define GEMM_GFLOPS 50.	/* At full speed */
+#define GEMM_FLOPS(N) FLOPS_SGEMM(N, N, N)
+#define GEMM_TIME(N) (GEMM_FLOPS(TILE_SIZE) / (GEMM_GFLOPS * 1000000000.))
+/* Return the duration of the GEMM kernel on one tile when running at the
+ * given frequency.  GEMM_TIME() is the duration in seconds at freq_fast; the
+ * result is that duration scaled linearly with the frequency, multiplied by
+ * 1000000. */
+static double _gemm_time(float frequency)
+{
+	/* GEMM_TIME() always computes for TILE_SIZE, whatever its argument */
+	double ret = GEMM_TIME(TILE_SIZE);
+
+	/* Fix according to real frequency, linear */
+	ret = ret / (frequency / freq_fast);
+	return ret * 1000000.;
+}
+
+/* Per-worker cost function for gemm: duration at the frequency of the given
+ * worker and implementation.  The task itself is not looked at. */
+static double gemm_time(struct starpu_task *t, unsigned workerid, unsigned i)
+{
+	const float freq = frequency(workerid, i);
+
+	(void)t;
+	return _gemm_time(freq);
+}
+
+/* We assume that TRSM decays a bit with frequency */
+#define TRSM_DECAY 0.5
+#define TRSM_FLOPS(N) FLOPS_STRSM(N, N)
+/* Return the duration of the TRSM kernel on one tile when running at the
+ * given frequency.  The duration at freq_fast is taken as 0.7 times the GEMM
+ * one; speed then follows a power law (exponent TRSM_DECAY) of the frequency
+ * offset by freq_min/2.  The result is multiplied by 1000000, like in
+ * _gemm_time(). */
+static double _trsm_time(float frequency)
+{
+	/* GEMM_TIME() always computes for TILE_SIZE, whatever its argument */
+	double ret = GEMM_TIME(TILE_SIZE)*0.7; /* as typically observed */
+
+	/* Fix according to real frequency, root */
+	ret = ret / (pow(frequency - freq_min/2, TRSM_DECAY) / pow(freq_fast - freq_min/2, TRSM_DECAY));
+	return ret * 1000000.;
+}
+
+/* Per-worker cost function for trsm: duration at the frequency of the given
+ * worker and implementation.  The task itself is not looked at. */
+static double trsm_time(struct starpu_task *t, unsigned workerid, unsigned i)
+{
+	const float freq = frequency(workerid, i);
+
+	(void)t;
+	return _trsm_time(freq);
+}
+
+/* We assume that POTRF decays strongly with frequency */
+#define POTRF_DECAY 0.5
+#define POTRF_FLOPS(N) FLOPS_SPOTRF(N)
+/* Return the duration of the POTRF kernel on one tile when running at the
+ * given frequency.  The duration at freq_fast is taken as 1.2 times the GEMM
+ * one; the speed factor is 1 at freq_fast and decreases as the frequency gets
+ * closer to freq_min/2.  The result is multiplied by 1000000, like in
+ * _gemm_time(). */
+static double _potrf_time(float frequency)
+{
+	/* GEMM_TIME() always computes for TILE_SIZE, whatever its argument */
+	double ret = GEMM_TIME(TILE_SIZE)*1.2; /* as typically observed */
+
+	/* Fix according to real frequency, asymptote */
+	ret = ret / (1. - POTRF_DECAY * ((freq_min/(frequency-freq_min/2)) - (freq_min/(freq_fast-freq_min/2))));
+	return ret * 1000000.;
+}
+/* Per-worker cost function for potrf: duration at the frequency of the given
+ * worker and implementation.  The task itself is not looked at. */
+static double potrf_time(struct starpu_task *t, unsigned workerid, unsigned i)
+{
+	const float freq = frequency(workerid, i);
+
+	(void)t;
+	return _potrf_time(freq);
+}
+
+
+/* Placeholder kernel implementation: it is not supposed to run in simgrid
+ * mode, so it only complains on stderr if it ever gets called. */
+void dummy_func(void *descr[], void *_args)
+{
+	(void)_args;
+	(void)descr;
+	fputs("?? shouldn't be called\n", stderr);
+}
+
+/* Define the codelets.
+ *
+ * CODELET(kernel, nb, modes...) expands, for the given kernel name, to:
+ * - kernel_energy(): the energy cost function, i.e. the power at the
+ *   frequency of the (worker, implementation) pair times the duration
+ *   returned by kernel_time().  The division by 1000000 undoes the scaling
+ *   by 1000000 that the _*_time() helpers apply to their result;
+ * - kernel_perf_model and kernel_energy_model: STARPU_PER_WORKER performance
+ *   models using respectively kernel_time() and kernel_energy() as cost
+ *   function;
+ * - kernel_cl: the codelet, with dummy_func as first CPU implementation, nb
+ *   buffers accessed with the given modes, and both models attached.
+ *
+ * kernel_time() has to be defined before the macro is expanded. */
+#define CODELET(kernel, nb, ...) \
+static double kernel##_energy(struct starpu_task *t, unsigned workerid, unsigned i) \
+{ \
+	double time = kernel##_time(t, workerid, i); \
+	return power(frequency(workerid, i)) * time / 1000000.; \
+} \
+\
+static struct starpu_perfmodel kernel##_perf_model = \
+{ \
+	.symbol = #kernel, \
+	.type = STARPU_PER_WORKER, \
+	.worker_cost_function = kernel##_time, \
+}; \
+\
+static struct starpu_perfmodel kernel##_energy_model = \
+{ \
+	.symbol = #kernel "_energy", \
+	.type = STARPU_PER_WORKER, \
+	.worker_cost_function = kernel##_energy, \
+}; \
+\
+static struct starpu_codelet kernel##_cl = \
+{ \
+	.cpu_funcs = { dummy_func }, \
+	.nbuffers = nb, \
+	.modes = {__VA_ARGS__}, \
+	.model = &kernel##_perf_model, \
+	.energy_model = &kernel##_energy_model, \
+};
+
+/* The three kernels of the tiled Cholesky factorization, with the access
+ * modes of their respective buffers */
+CODELET(potrf, 1, STARPU_RW)
+CODELET(trsm, 2, STARPU_R, STARPU_RW)
+CODELET(gemm, 3, STARPU_R, STARPU_R, STARPU_RW)
+
+
+/*
+ * Submit NITER times the task graph of a tiled Cholesky factorization over
+ * N x N tiles, and report the average duration, performance and energy.
+ *
+ * Usage: energy_efficiency [N [NITER]], N defaulting to 40 and NITER to 10.
+ * Nothing is run when N or NITER is 0.
+ *
+ * Environment variables: STARPU_FREQ_MIN, STARPU_FREQ_SLOW, STARPU_FREQ_FAST,
+ * STARPU_POWER_MIN, STARPU_POWER_FAST and STARPU_NCPU_SLOW tune the model;
+ * STARPU_IDLE_POWER is set to 30 when not already set.
+ *
+ * Returns STARPU_TEST_SKIPPED when StarPU reports -ENODEV, 0 otherwise.
+ */
+int main(int argc, char *argv[]) {
+	/* Initialize environment variables */
+
+	if (!getenv("STARPU_IDLE_POWER"))
+		setenv("STARPU_IDLE_POWER", "30", 1);
+	const char *hostname = getenv("STARPU_HOSTNAME");
+	if (!hostname || strcmp(hostname, "sirocco"))
+	{
+		printf("Warning: This is expected to be run with export STARPU_HOSTNAME=sirocco\n");
+	}
+
+	freq_min =  starpu_get_env_number_default("STARPU_FREQ_MIN", 1200);
+	freq_slow =  starpu_get_env_number_default("STARPU_FREQ_SLOW", 1200);
+	freq_fast =  starpu_get_env_number_default("STARPU_FREQ_FAST", 3500);
+
+	power_min =  starpu_get_env_float_default("STARPU_POWER_MIN", 2);
+	power_fast =  starpu_get_env_float_default("STARPU_POWER_FAST", 8.2);
+
+	/* Number of slow CPU cores */
+	ncpu_slow = starpu_get_env_number_default("STARPU_NCPU_SLOW", -1);
+	if (ncpu_slow == -1)
+	{
+		/* Enable second implementation.  */
+		potrf_cl.cpu_funcs[1] = dummy_func;
+		trsm_cl.cpu_funcs[1] = dummy_func;
+		gemm_cl.cpu_funcs[1] = dummy_func;
+	}
+
+	/* Initialize StarPU */
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+
+	conf.ncuda = 0;
+	conf.nopencl = 0;
+	if (!getenv("STARPU_SCHED"))
+		conf.sched_policy_name = "dmdas";
+
+	int ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	unsigned N, k, m, n, iter, NITER;
+	if (argc < 2)
+		N = 40;
+	else
+		N = atoi(argv[1]);
+	if (argc < 3)
+		NITER = 10;
+	else
+		NITER = atoi(argv[2]);
+	/* Nothing to factorize, or no iteration to average over (the
+	 * statistics below divide by NITER) */
+	if (N == 0 || NITER == 0)
+	{
+		starpu_shutdown();
+		return 0;
+	}
+
+
+	/* Give parameter summary to user */
+
+	printf("freqs (MHz):\n");
+	printf("%f %f %f\n", freq_min, freq_slow, freq_fast);
+	printf("\n");
+
+	printf("per-core power (W):\n");
+	printf("%f %f\n", power_min, power_fast);
+	printf("%f %f %f\n", power(freq_min), power(freq_slow), power(freq_fast));
+	printf("\n");
+
+	printf("kernel perfs in GFlops (min, slow, fast):\n");
+	printf("gemm:\t%f %f %f\n",
+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_min) / 1000,
+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_slow) / 1000,
+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_fast) / 1000);
+
+	printf("trsm:\t%f %f %f\n",
+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_min) / 1000,
+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_slow) / 1000,
+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_fast) / 1000);
+
+	printf("potrf:\t%f %f %f\n",
+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_min) / 1000,
+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_slow) / 1000,
+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_fast) / 1000);
+	printf("\n");
+
+	printf("kernel efficiency in GFlops/W (min, slow, fast):\n");
+	printf("gemm:\t%f %f %f\n",
+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_min) / 1000 / power(freq_min),
+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_slow) / 1000 / power(freq_slow),
+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_fast) / 1000 / power(freq_fast));
+
+	printf("trsm:\t%f %f %f\n",
+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_min) / 1000 / power(freq_min),
+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_slow) / 1000 / power(freq_slow),
+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_fast) / 1000 / power(freq_fast));
+
+	printf("potrf:\t%f %f %f\n",
+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_min) / 1000 / power(freq_min),
+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_slow) / 1000 / power(freq_slow),
+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_fast) / 1000 / power(freq_fast));
+	printf("\n");
+
+
+	/* Now compute */
+
+	/* One handle per tile; they are registered as void data, only the
+	 * dependencies between tasks matter here */
+	starpu_data_handle_t A[N][N];
+
+	for (m = 0; m < N; m++)
+		for (n = 0; n < N; n++)
+			starpu_void_data_register(&A[m][n]);
+
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* Sums and sums of squares over the iterations, for the average and
+	 * the deviation */
+	double timing_sum = 0.;
+	double energy_sum = 0.;
+	double timing_sum2 = 0.;
+	double energy_sum2 = 0.;
+
+	for (iter = 0; iter < NITER; iter++)
+	{
+		double start = starpu_timing_now();
+		double start_energy = starpu_energy_used();
+
+		for (k = 0; k < N; k++)
+		{
+			starpu_iteration_push(k);
+			ret = starpu_task_insert(&potrf_cl,
+						 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k) : STARPU_MAX_PRIO,
+						 STARPU_RW, A[k][k],
+						 STARPU_FLOPS, (double) FLOPS_SPOTRF(TILE_SIZE),
+						 STARPU_TAG_ONLY, TAG11(k),
+						 0);
+			if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+			for (m = k+1; m<N; m++)
+			{
+				ret = starpu_task_insert(&trsm_cl,
+							 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							 STARPU_R, A[k][k],
+							 STARPU_RW, A[m][k],
+							 STARPU_FLOPS, (double) FLOPS_STRSM(TILE_SIZE, TILE_SIZE),
+							 STARPU_TAG_ONLY, TAG21(m,k),
+							 0);
+				if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			}
+
+			for (m = k+1; m<N; m++)
+			{
+				for (n = k+1; n<N; n++)
+				{
+					/* Only the lower triangle gets updated */
+					if (n <= m)
+					{
+						ret = starpu_task_insert(&gemm_cl,
+									 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+									 STARPU_R, A[m][k],
+									 STARPU_R, A[n][k],
+									 gemm_cl.modes[2], A[m][n],
+									 STARPU_FLOPS, (double) FLOPS_SGEMM(TILE_SIZE, TILE_SIZE, TILE_SIZE),
+									 STARPU_TAG_ONLY, TAG22(k,m,n),
+									 0);
+						if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+					}
+				}
+			}
+			starpu_iteration_pop();
+		}
+
+		starpu_task_wait_for_all();
+
+		double end = starpu_timing_now();
+		double end_energy = starpu_energy_used();
+		double timing = end - start;
+		double energy = end_energy - start_energy;
+		timing_sum += timing;
+		timing_sum2 += timing*timing;
+		energy_sum += energy;
+		energy_sum2 += energy*energy;
+	}
+
+
+	/* Make stats and print */
+
+	double timing_avg = timing_sum / NITER;
+	double timing_dev = sqrt((fabs(timing_sum2 - (timing_sum*timing_sum)/NITER))/NITER);
+	double energy_avg = energy_sum / NITER;
+	double energy_dev = sqrt((fabs(energy_sum2 - (energy_sum*energy_sum)/NITER))/NITER);
+	double flop = FLOPS_SPOTRF(TILE_SIZE * N);
+
+	/* Second column: number of slow cores in the static version, slow
+	 * frequency otherwise */
+	unsigned toprint_slow;
+	if (ncpu_slow >= 0)
+		toprint_slow = ncpu_slow;
+	else
+		toprint_slow = freq_slow;
+
+	printf("# size\t%s\tms +-\tGFlop/s +-\ten. (J) +-\tGF/W\n",
+			ncpu_slow >= 0 ? "nslow" : "fslow");
+	printf("%u\t%u\t%.0f %.1f\t%.1f %.1f\t%.1f %.1f\t%.2f\n",
+			TILE_SIZE * N,
+			toprint_slow,
+			timing_avg/1000,
+			timing_dev/1000,
+			(flop/timing_avg/1000.0f),
+			(flop/(timing_avg*timing_avg)/1000.f)*timing_dev,
+			energy_avg, energy_dev,
+			flop/1000000000./energy_avg);
+
+	for (m = 0; m < N; m++)
+		for (n = 0; n < N; n++)
+			starpu_data_unregister(A[m][n]);
+
+	starpu_shutdown();
+	return 0;
+}

+ 61 - 0
tests/energy/perfs.gp

@@ -0,0 +1,61 @@
+# Plot the models used by energy_efficiency.c according to the core frequency:
+# power (power.eps), kernel performance (perfs.eps) and kernel efficiency
+# (efficiency.eps).
+set term postscript eps enhanced color font ",20"
+set key top left
+set xlabel "frequency (MHz)"
+
+# Same values as the defaults and constants of energy_efficiency.c
+freq_min = 1200
+freq_fast = 3500
+power_min = 2
+power_fast = 8.2
+TRSM_DECAY = 0.5
+POTRF_DECAY = 0.5
+
+
+# Plot the power according to frequency (cubic curve)
+
+freq_min3 = freq_min * freq_min * freq_min
+freq_fast3 = freq_fast * freq_fast * freq_fast
+alpha = (power_fast - power_min) / (freq_fast3 - freq_min3)
+power(frequency) = power_min + alpha * (frequency*frequency*frequency - freq_min3)
+ 
+set output "power.eps"
+set ylabel "power (W)"
+
+plot [frequency=freq_min:freq_fast] [y=0:] power(frequency) lw 2 notitle
+
+
+# Plot the kernel performance according to frequency
+
+set output "perfs.eps"
+set ylabel "performance (GFlop/s)"
+
+# Performance of each kernel at freq_fast.  The gemm value is GEMM_GFLOPS; the
+# trsm and potrf values were presumably copied from the "kernel perfs" output
+# of energy_efficiency -- regenerate them if the flop counts or models change.
+gemm_max_perf = 50
+trsm_max_perf = 35.784040
+potrf_max_perf = 6.964803
+
+# Speed of each kernel relative to its speed at freq_fast, same formulas as in
+# the _*_time() functions of energy_efficiency.c
+gemm_factor(frequency) = frequency / freq_fast
+trsm_factor(frequency) = (frequency - freq_min/2) ** TRSM_DECAY / (freq_fast - freq_min/2) ** TRSM_DECAY
+potrf_factor(frequency) = 1 - POTRF_DECAY * ((freq_min/(frequency-freq_min/2)) - (freq_min/(freq_fast-freq_min/2)))
+
+plot [frequency=freq_min:freq_fast] \
+     gemm_max_perf * gemm_factor(frequency) lw 2 title "gemm", \
+     trsm_max_perf * trsm_factor(frequency) lw 2 title "trsm", \
+     potrf_max_perf * potrf_factor(frequency) lw 2 title "potrf"
+
+
+# Plot the kernel efficiency according to frequency
+
+set output "efficiency.eps"
+set key top right
+set ylabel "efficiency (GFlop/W)"
+
+# Efficiency of each kernel at freq_fast: the performance above divided by
+# power_fast
+gemm_max_efficiency = 6.097561
+trsm_max_efficiency = 4.363907
+potrf_max_efficiency = 0.849366
+
+# Power relative to the power at freq_fast
+power_factor(frequency) = power(frequency) / power(freq_fast)
+
+plot [frequency=freq_min:freq_fast] \
+     gemm_max_efficiency * gemm_factor(frequency) / power_factor(frequency) lw 2 title "gemm", \
+     trsm_max_efficiency * trsm_factor(frequency) / power_factor(frequency)  lw 2 title "trsm", \
+     potrf_max_efficiency * potrf_factor(frequency) / power_factor(frequency)  lw 2 title "potrf"
+

+ 57 - 0
tests/energy/static.sh

@@ -0,0 +1,57 @@
+#!/bin/sh
+
+# To have 24 cores
+export STARPU_HOSTNAME=sirocco
+
+# To avoid slowing down simulation
+export MALLOC_PERTURB_=0
+
+# You can play with these
+export STARPU_FREQ_SLOW=1200
+export STARPU_POWER_SLOW=2
+export STARPU_POWER_FAST=8.2
+export N=40
+export NITER=30
+
+GAMMAS="1000000 100000 10000 0"
+
+for gamma in $GAMMAS; do
+	(for ncpu_slow in $(seq 0 24) ; do 
+		STARPU_SCHED_GAMMA=$gamma STARPU_NCPU_SLOW=$ncpu_slow \
+			./energy_efficiency $N $NITER | grep "^$(($N * 512))	" &
+	done) | sort -n -k 2 > static.$gamma.dat
+done
+
+cat > static.gp << EOF
+set output "static.eps"
+set term postscript eps enhanced color font ",20"
+set key top center
+set xlabel "performance (GFlop/s)"
+set ylabel "energy (J)"
+
+plot \\
+EOF
+for gamma in $GAMMAS; do
+	cat >> static.gp << EOF
+	"static.$gamma.dat" using 5:7:6:8 with xyerrorlines title "$gamma", \\
+EOF
+done
+
+cat >> static.gp << EOF
+
+set output "static-time.eps"
+set xlabel "time (ms)"
+set ylabel "energy (J)"
+
+plot \\
+EOF
+for gamma in $GAMMAS; do
+	cat >> static.gp << EOF
+	"static.$gamma.dat" using 3:7:4:8 with xyerrorlines title "$gamma", \\
+EOF
+done
+
+
+gnuplot static.gp
+gv static.eps &
+gv static-time.eps &