5 years ago · 4e71fdd318
--- a/ChangeLog
+++ b/ChangeLog
@@ -31,9 +31,12 @@ New features:
 
				     files. This file can be parsed by the new script
			
 
				     starpu_fxt_number_events_to_names.py to convert event keys to event names.
			
 
				   * New STARPU_PER_WORKER perfmodel.
			
 
				+  * Add energy accounting in the simgrid mode: starpu_energy_use() and
			
 
				+    starpu_energy_used().
			
 
				 
			
 
				 Small changes:
			
 
				   * Use the S4U interface of Simgrid instead of xbt and MSG.
			
 
				+  * Add a synthetic energy efficiency testcase.
			
 
				 
			
 
				 StarPU 1.3.4 (git revision xxx)
			
 
				 ==============================================
			
--- a/configure.ac
+++ b/configure.ac
@@ -3519,6 +3519,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
 
				   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
			
 
				   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
			
 
				   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
			
 
				+  mkdir -p tests/energy
			
 
				+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
			
 
				+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
			
 
				   mkdir -p tests/datawizard
			
 
				   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
			
 
				   mkdir -p tests/overlap
			
--- a/include/starpu_stdlib.h
+++ b/include/starpu_stdlib.h
@@ -251,6 +251,20 @@ void starpu_sleep(float nb_sec);
 
				   */
			
 
				 void starpu_usleep(float nb_micro_sec);
			
 
				 
			
 
				+/**
			
 
				+   Account for \p joules J being used.
			
 
				+   This is supported in simgrid mode, to record how much energy was used, and will
			
 
				+   show up in further calls to starpu_energy_used().
			
 
				+  */
			
 
				+void starpu_energy_use(float joules);
			
 
				+
			
 
				+/**
			
 
				+   Return the amount of energy having been used in J.
			
 
				+   This accounts for the amounts passed to starpu_energy_use(), but also the static
			
 
				+   energy use set by the \ref STARPU_IDLE_POWER environment variable.
			
 
				+  */
			
 
				+double starpu_energy_used(void);
			
 
				+
			
 
				 /** @} */
			
 
				 
			
 
				 #ifdef __cplusplus
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -513,7 +513,7 @@ struct starpu_codelet
 
				 
			
 
				 	/**
			
 
				 	   Optional pointer to the task energy consumption performance
			
 
				-	   model associated to this codelet. This optional field is
			
 
				+	   model associated to this codelet (in J). This optional field is
			
 
				 	   ignored when set to <c>NULL</c> or when its field
			
 
				 	   starpu_perfmodel::symbol is not set. In the case of
			
 
				 	   parallel codelets, this has to account for all processing
			
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 
				 extern void smpi_process_set_user_data(void *);
			
 
				 #endif
			
 
				 
			
 
				+static double _starpu_simgrid_dynamic_energy = 0.0;
			
 
				+
			
 
				 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
			
 
				  * initialized through MSG_process_attach */
			
 
				 static int simgrid_started;
			
@@ -629,6 +631,7 @@ struct task
 
				 #else
			
 
				 	msg_task_t task;
			
 
				 #endif
			
 
				+	double energy;
			
 
				 
			
 
				 	/* communication termination signalization */
			
 
				 	unsigned *finished;
			
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 
				 		MSG_task_execute(task->task);
			
 
				 		MSG_task_destroy(task->task);
			
 
				 #endif
			
 
				+		starpu_energy_use(task->energy);
			
 
				 		_STARPU_DEBUG("task %p finished\n", task);
			
 
				 
			
 
				 		*task->finished = 1;
			
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 
				 }
			
 
				 
			
 
				 /* Task execution submitted by StarPU */
			
 
				-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
			
 
				+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
			
 
				 {
			
 
				 	struct starpu_task *starpu_task = j->task;
			
 
				 	double flops;
			
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
				 
			
 
				 	if (isnan(length))
			
 
				 	{
			
 
				-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
			
 
				+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
			
 
				 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
			
 
				 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
			
 
				 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
			
 
				                 /* TODO: option to add variance according to performance model,
			
 
				                  * to be able to easily check scheduling robustness */
			
 
				 	}
			
 
				+	if (isnan(energy))
			
 
				+	{
			
 
				+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
			
 
				+		/* TODO: option to add variance according to performance model,
			
 
				+		 * to be able to easily check scheduling robustness */
			
 
				+	}
			
 
				 
			
 
				 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
			
 
				 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
			
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
				 		MSG_task_execute(simgrid_task);
			
 
				 		MSG_task_destroy(simgrid_task);
			
 
				 #endif
			
 
				+		starpu_energy_use(energy);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
				 #else
			
 
				 		task->task = simgrid_task;
			
 
				 #endif
			
 
				+		task->energy = energy;
			
 
				 		task->finished = finished;
			
 
				 		*finished = 0;
			
 
				 		task->next = NULL;
			
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+void starpu_energy_use(float joules)
			
 
				+{
			
 
				+	_starpu_simgrid_dynamic_energy += joules;
			
 
				+}
			
 
				+
			
 
				+double starpu_energy_used(void)
			
 
				+{
			
 
				+	float idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
			
 
				+	return _starpu_simgrid_dynamic_energy + idle_power * starpu_timing_now() / 1000000;
			
 
				+}
			
 
				 
			
 
				 #endif
			
--- a/src/core/simgrid.h
+++ b/src/core/simgrid.h
@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 
				 void _starpu_simgrid_actor_setup(void);
			
 
				 void _starpu_simgrid_wait_tasks(int workerid);
			
 
				 struct _starpu_job;
			
 
				-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
			
 
				+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
			
 
				 struct _starpu_data_request;
			
 
				 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
			
 
				 union _starpu_async_channel_event;
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -108,7 +108,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
				 				_SIMGRID_TIMER_END;
			
 
				 			}
			
 
				 			else
			
 
				-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
			
 
				+			{
			
 
				+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
			
 
				+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
			
 
				+			}
			
 
				 #else
			
 
				 #  ifdef STARPU_PAPI
			
 
				 			_starpu_profiling_papi_task_start_counters(task);
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -531,8 +531,11 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 
				 				_SIMGRID_TIMER_END;
			
 
				 			}
			
 
				 		else
			
 
				-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
			
 
				+		{
			
 
				+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			
 
				+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
			
 
				 				async ? &task_finished[workerid][pipeline_idx] : NULL);
			
 
				+		}
			
 
				 #else
			
 
				 #ifdef HAVE_LIBNVIDIA_ML
			
 
				 		unsigned long long energy_start = 0;
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -948,6 +948,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
				 		_STARPU_TRACE_START_EXECUTING();
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 		double length = NAN;
			
 
				+		double energy = NAN;
			
 
				 		int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
			
 
				 		int simulate = 1;
			
 
				 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
			
@@ -976,6 +977,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
				 #else
			
 
				 			length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
			
 
				 #endif
			
 
				+			energy = info->energy_consumed;
			
 
				 			/* And give the simulated time to simgrid */
			
 
				 			simulate = 1;
			
 
				 #endif
			
@@ -989,8 +991,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
				 			}
			
 
				 
			
 
				 		if (simulate)
			
 
				-			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
			
 
				+		{
			
 
				+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			
 
				+			_starpu_simgrid_submit_job(worker->workerid, sched_ctx->id, j, &worker->perf_arch, length, energy, /* workerid, then sched_ctx_id */
			
 
				 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
			
 
				+		}
			
 
				 #else
			
 
				 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				 
			
--- a/src/sched_policies/component_best_implementation.c
+++ b/src/sched_policies/component_best_implementation.c
@@ -41,12 +41,11 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
			
 
				 		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
			
 
				 		{
			
 
				 			if(starpu_worker_can_execute_task(workerid, task, impl))
			
 
				 			{
			
 
				-				double d = starpu_task_expected_length(task, archtype, impl);
			
 
				+				double d = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, impl);
			
 
				 				if(isnan(d))
			
 
				 				{
			
 
				 					best_impl = impl;
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -42,6 +42,9 @@ EXTRA_DIST =					\
 
				 	microbenchs/parallel_independent_heterogeneous_tasks.sh	\
			
 
				 	microbenchs/parallel_independent_homogeneous_tasks_data.sh	\
			
 
				 	microbenchs/parallel_independent_homogeneous_tasks.sh	\
			
 
				+	energy/static.sh			\
			
 
				+	energy/dynamic.sh			\
			
 
				+	energy/perfs.gp				\
			
 
				 	datawizard/scratch_opencl_kernel.cl     \
			
 
				 	datawizard/sync_and_notify_data_opencl_codelet.cl\
			
 
				 	datawizard/opencl_codelet_unsigned_inc_kernel.cl \
			
@@ -212,6 +215,7 @@ myPROGRAMS +=					\
 
				 
			
 
				 if STARPU_SIMGRID
			
 
				 myPROGRAMS +=					\
			
 
				+	energy/energy_efficiency		\
			
 
				 	datawizard/simgrid-locality
			
 
				 endif
			
 
				 
			
--- a/tests/energy/dynamic.sh
+++ b/tests/energy/dynamic.sh
@@ -0,0 +1,54 @@
 
				+#!/bin/sh
			
 
				+
			
 
				+# To have 24 cores
			
 
				+export STARPU_HOSTNAME=sirocco
			
 
				+
			
 
				+# To avoid slowing down simulation
			
 
				+export MALLOC_PERTURB_=0
			
 
				+
			
 
				+# You can play with these
			
 
				+export N=40
			
 
				+export NITER=30
			
 
				+
			
 
				+GAMMAS="1000000 100000 76000 10000 0"
			
 
				+
			
 
				+for gamma in $GAMMAS; do
			
 
				+	(for freq_slow in $(seq 1200 200 3500) ; do 
			
 
				+		STARPU_SCHED_GAMMA=$gamma STARPU_FREQ_SLOW=$freq_slow \
			
 
				+			./energy_efficiency $N $NITER | grep "^$(($N * 512))	" &
			
 
				+	done) | sort -n -k 2 > dynamic.$gamma.dat
			
 
				+done
			
 
				+
			
 
				+cat > dynamic.gp << EOF
			
 
				+set output "dynamic.eps"
			
 
				+set term postscript eps enhanced color font ",20"
			
 
				+set key bottom right
			
 
				+set xlabel "performance (GFlop/s)"
			
 
				+set ylabel "energy (J)"
			
 
				+
			
 
				+plot \\
			
 
				+EOF
			
 
				+for gamma in $GAMMAS; do
			
 
				+	cat >> dynamic.gp << EOF
			
 
				+	"dynamic.$gamma.dat" using 5:7:6:8 with xyerrorlines lw 2 title "$gamma", \\
			
 
				+EOF
			
 
				+done
			
 
				+
			
 
				+cat >> dynamic.gp << EOF
			
 
				+
			
 
				+set output "dynamic-time.eps"
			
 
				+set xlabel "time (ms)"
			
 
				+set ylabel "energy (J)"
			
 
				+
			
 
				+plot \\
			
 
				+EOF
			
 
				+for gamma in $GAMMAS; do
			
 
				+	cat >> dynamic.gp << EOF
			
 
				+	"dynamic.$gamma.dat" using 3:7:4:8 with xyerrorlines lw 2 title "$gamma", \\
			
 
				+EOF
			
 
				+done
			
 
				+
			
 
				+
			
 
				+gnuplot dynamic.gp
			
 
				+gv dynamic.eps &
			
 
				+gv dynamic-time.eps &
			
--- a/tests/energy/energy_efficiency.c
+++ b/tests/energy/energy_efficiency.c
@@ -0,0 +1,496 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2016-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2016       Bérangère Subervie
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdbool.h>
			
 
				+#include <starpu.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+/*
			
 
				+ * This tries to run kernels with different efficiency depending on the core
			
 
				+ * frequency.
			
 
				+ *
			
 
				+ * This is based on the Cholesky factorization, which is made to exhibit three
			
 
				+ * caricatural cases as follows:
			
 
				+ *
			
 
				+ * - gemm: always get faster with higher frequency
			
 
				+ * - trsm: gets faster with higher frequency, but efficiency gets lower and
			
 
				+ * lower
			
 
				+ * - potrf: reaches a maximum performance, after which there is no point in
			
 
				+ * running it at higher frequency.
			
 
				+ *
			
 
				+ * We here assume that the power use is the same for the different kernels
			
 
				+ * (which wouldn't be true for real kernels, measurements would be needed, to
			
 
				+ * feed the performance models).
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+/* These are the different frequency and power parameters, as measured and
			
 
				+ * provided to this program */
			
 
				+static float freq_min, freq_fast;
			
 
				+static float power_min, power_fast;
			
 
				+
			
 
				+/*
			
 
				+ * This returns the dynamic power used by a CPU core in W at a given frequency
			
 
				+ * in MHz
			
 
				+ * This assumes C.V^2.F with V being proportional to F, thus C.F^3
			
 
				+
			
 
				+   freq_min = 1200
			
 
				+   freq_fast = 3500
			
 
				+   power_min = 2
			
 
				+   power_fast = 8.2
			
 
				+
			
 
				+   freq_min3 = freq_min * freq_min * freq_min
			
 
				+   freq_fast3 = freq_fast * freq_fast * freq_fast
			
 
				+   alpha = (power_fast - power_min) / (freq_fast3 - freq_min3)
			
 
				+   power(frequency) = power_min + alpha * (frequency*frequency*frequency - freq_min3)
			
 
				+   plot [frequency=freq_min:freq_fast] power(frequency) lw 2
			
 
				+
			
 
				+ */
			
 
				+static float power(float frequency)
			
 
				+{
			
 
				+	double freq_min3 = freq_min * freq_min * freq_min;
			
 
				+	double freq_fast3 = freq_fast * freq_fast * freq_fast;
			
 
				+	double alpha = (power_fast - power_min) / (freq_fast3 - freq_min3);
			
 
				+	return power_min + alpha * ( frequency*frequency*frequency - freq_min3);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * This returns the frequency of the given worker and implementation in MHz.
			
 
				+ * This is where we can tune either a given number of cores at a low frequency,
			
 
				+ * or which implementation uses which frequency. */
			
 
				+
			
 
				+/* These are the chosen parameters: how many cores get slowed down, at which
			
 
				+ * frequency */
			
 
				+static int ncpu_slow = -1;
			
 
				+static float freq_slow;
			
 
				+
			
 
				+static float frequency(int worker, unsigned i)
			
 
				+{
			
 
				+	if (ncpu_slow == -1)
			
 
				+	{
			
 
				+		/* Version that allows the runtime to switch speed between
			
 
				+		 * tasks, by exposing two implementations with different time
			
 
				+		 * and energy */
			
 
				+		if (i == 0)
			
 
				+			/* Slow implementation */
			
 
				+			return freq_slow;
			
 
				+		else
			
 
				+			/* Fast implementation */
			
 
				+			return freq_fast;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		/* Version that assumes that ncpu_slow workers are running at
			
 
				+		 * slow speed */
			
 
				+		if (worker < ncpu_slow)
			
 
				+			return freq_slow;
			
 
				+		else
			
 
				+			return freq_fast;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* This is from magma
			
 
				+
			
 
				+  -- Innovative Computing Laboratory
			
 
				+  -- Electrical Engineering and Computer Science Department
			
 
				+  -- University of Tennessee
			
 
				+  -- (C) Copyright 2009
			
 
				+
			
 
				+  Redistribution  and  use  in  source and binary forms, with or without
			
 
				+  modification,  are  permitted  provided  that the following conditions
			
 
				+  are met:
			
 
				+
			
 
				+  * Redistributions  of  source  code  must  retain  the above copyright
			
 
				+    notice,  this  list  of  conditions  and  the  following  disclaimer.
			
 
				+  * Redistributions  in  binary  form must reproduce the above copyright
			
 
				+    notice,  this list of conditions and the following disclaimer in the
			
 
				+    documentation  and/or other materials provided with the distribution.
			
 
				+  * Neither  the  name of the University of Tennessee, Knoxville nor the
			
 
				+    names of its contributors may be used to endorse or promote products
			
 
				+    derived from this software without specific prior written permission.
			
 
				+
			
 
				+  THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
			
 
				+  ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
			
 
				+  LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
			
 
				+  A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
			
 
				+  HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
			
 
				+  SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
			
 
				+  LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
			
 
				+  DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
			
 
				+  THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
			
 
				+  (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
			
 
				+  OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
			
 
				+
			
 
				+  */
			
 
				+
			
 
				+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
			
 
				+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n)      ) * (double)(__n) - (1. / 6.)))
			
 
				+
			
 
				+#define FLOPS_SPOTRF(__n) (     FMULS_POTRF((__n)) +       FADDS_POTRF((__n)) )
			
 
				+
			
 
				+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
			
 
				+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
			
 
				+
			
 
				+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
			
 
				+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
			
 
				+
			
 
				+#define FMULS_TRSM FMULS_TRMM
			
 
				+#define FADDS_TRSM FADDS_TRMM /* additions of a TRSM equal those of a TRMM */
			
 
				+
			
 
				+#define FLOPS_STRSM(__m, __n) (     FMULS_TRSM((__m), (__n)) +       FADDS_TRSM((__m), (__n)) )
			
 
				+
			
 
				+
			
 
				+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
			
 
				+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
			
 
				+
			
 
				+#define FLOPS_SGEMM(__m, __n, __k) (     FMULS_GEMM((__m), (__n), (__k)) +       FADDS_GEMM((__m), (__n), (__k)) )
			
 
				+
			
 
				+
			
 
				+
			
 
				+/* Tags for spotting tasks in the trace */
			
 
				+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
			
 
				+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
			
 
				+					| (unsigned long long)(j))))
			
 
				+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
			
 
				+					| ((unsigned long long)(i)<<16)	\
			
 
				+					| (unsigned long long)(j))))
			
 
				+
			
 
				+/* Arbitrary tile size */
			
 
				+#define	TILE_SIZE	512
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * Kernel time performance models, would normally be provided by measurements
			
 
				+ */
			
 
				+
			
 
				+/* We assume that GEMM scales perfectly with frequency */
			
 
				+#define GEMM_GFLOPS 50.	/* At full speed */
			
 
				+#define GEMM_FLOPS(N) FLOPS_SGEMM(N, N, N)
			
 
				+#define GEMM_TIME(N) (GEMM_FLOPS(TILE_SIZE) / (GEMM_GFLOPS * 1000000000.))
			
 
				+static double _gemm_time(float frequency)
			
 
				+{
			
 
				+	double ret = GEMM_TIME(N);
			
 
				+
			
 
				+	/* Fix according to real frequency, linear */
			
 
				+	ret = GEMM_TIME(N) / (frequency / freq_fast);
			
 
				+	return ret * 1000000.;
			
 
				+}
			
 
				+
			
 
				+static double gemm_time(struct starpu_task *t, unsigned workerid, unsigned i)
			
 
				+{
			
 
				+	(void)t;
			
 
				+	return _gemm_time(frequency(workerid, i));
			
 
				+}
			
 
				+
			
 
				+/* We assume that TRSM decays a bit with frequency */
			
 
				+#define TRSM_DECAY 0.5
			
 
				+#define TRSM_FLOPS(N) FLOPS_STRSM(N, N)
			
 
				+static double _trsm_time(float frequency)
			
 
				+{
			
 
				+	double ret = GEMM_TIME(N)*0.7; /* as typically observed */
			
 
				+
			
 
				+	/* Fix according to real frequency, root */
			
 
				+	ret = ret / (pow(frequency - freq_min/2, TRSM_DECAY) / pow(freq_fast - freq_min/2, TRSM_DECAY));
			
 
				+	return ret * 1000000.;
			
 
				+}
			
 
				+
			
 
				+static double trsm_time(struct starpu_task *t, unsigned workerid, unsigned i)
			
 
				+{
			
 
				+	(void)t;
			
 
				+	return _trsm_time(frequency(workerid, i));
			
 
				+}
			
 
				+
			
 
				+/* We assume that POTRF decays strongly with frequency */
			
 
				+#define POTRF_DECAY 0.5
			
 
				+#define POTRF_FLOPS(N) FLOPS_SPOTRF(N)
			
 
				+static double _potrf_time(float frequency)
			
 
				+{
			
 
				+	double ret = GEMM_TIME(N)*1.2; /* as typically observed */
			
 
				+
			
 
				+	/* Fix according to real frequency, asymptote */
			
 
				+	ret = ret / (1. - POTRF_DECAY * ((freq_min/(frequency-freq_min/2)) - (freq_min/(freq_fast-freq_min/2))));
			
 
				+	return ret * 1000000.;
			
 
				+}
			
 
				+static double potrf_time(struct starpu_task *t, unsigned workerid, unsigned i)
			
 
				+{
			
 
				+	(void)t;
			
 
				+	return _potrf_time(frequency(workerid, i));
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* stub for kernel, shouldn't be getting called in simgrid mode */
			
 
				+void dummy_func(void *descr[], void *_args)
			
 
				+{
			
 
				+	(void)descr; (void)_args;
			
 
				+	fprintf(stderr, "?? shouldn't be called\n");
			
 
				+}
			
 
				+
			
 
				+/* Define the codelets */
			
 
				+#define CODELET(kernel, nb, ...) \
			
 
				+static double kernel##_energy(struct starpu_task *t, unsigned workerid, unsigned i) \
			
 
				+{ \
			
 
				+	double time = kernel##_time(t, workerid, i); \
			
 
				+	return power(frequency(workerid, i)) * time / 1000000.; \
			
 
				+} \
			
 
				+\
			
 
				+static struct starpu_perfmodel kernel##_perf_model = \
			
 
				+{ \
			
 
				+	.symbol = #kernel, \
			
 
				+	.type = STARPU_PER_WORKER, \
			
 
				+	.worker_cost_function = kernel##_time, \
			
 
				+}; \
			
 
				+\
			
 
				+static struct starpu_perfmodel kernel##_energy_model = \
			
 
				+{ \
			
 
				+	.symbol = #kernel "_energy", \
			
 
				+	.type = STARPU_PER_WORKER, \
			
 
				+	.worker_cost_function = kernel##_energy, \
			
 
				+}; \
			
 
				+\
			
 
				+static struct starpu_codelet kernel##_cl = \
			
 
				+{ \
			
 
				+	.cpu_funcs = { dummy_func }, \
			
 
				+	.nbuffers = nb, \
			
 
				+	.modes = {__VA_ARGS__}, \
			
 
				+	.model = &kernel##_perf_model, \
			
 
				+	.energy_model = &kernel##_energy_model, \
			
 
				+};
			
 
				+
			
 
				+CODELET(potrf, 1, STARPU_RW)
			
 
				+CODELET(trsm, 2, STARPU_R, STARPU_RW)
			
 
				+CODELET(gemm, 3, STARPU_R, STARPU_R, STARPU_RW)
			
 
				+
			
 
				+
			
 
				+int main(int argc, char *argv[]) {
			
 
				+	/* Initialize environment variables */
			
 
				+
			
 
				+	if (!getenv("STARPU_IDLE_POWER"))
			
 
				+		setenv("STARPU_IDLE_POWER", "30", 1);
			
 
				+	const char *hostname = getenv("STARPU_HOSTNAME");
			
 
				+	if (!hostname || strcmp(hostname, "sirocco"))
			
 
				+	{
			
 
				+		printf("Warning: This is expected to be run with export STARPU_HOSTNAME=sirocco\n");
			
 
				+	}
			
 
				+
			
 
				+	freq_min =  starpu_get_env_number_default("STARPU_FREQ_MIN", 1200);
			
 
				+	freq_slow =  starpu_get_env_number_default("STARPU_FREQ_SLOW", 1200);
			
 
				+	freq_fast =  starpu_get_env_number_default("STARPU_FREQ_FAST", 3500);
			
 
				+
			
 
				+	power_min =  starpu_get_env_float_default("STARPU_POWER_MIN", 2);
			
 
				+	power_fast =  starpu_get_env_float_default("STARPU_POWER_FAST", 8.2);
			
 
				+
			
 
				+	/* Number of slow CPU cores */
			
 
				+	ncpu_slow = starpu_get_env_number_default("STARPU_NCPU_SLOW", -1);
			
 
				+	if (ncpu_slow == -1)
			
 
				+	{
			
 
				+		/* Enable second implementation.  */
			
 
				+		potrf_cl.cpu_funcs[1] = dummy_func;
			
 
				+		trsm_cl.cpu_funcs[1] = dummy_func;
			
 
				+		gemm_cl.cpu_funcs[1] = dummy_func;
			
 
				+	}
			
 
				+
			
 
				+	/* Initialize StarPU */
			
 
				+	struct starpu_conf conf;
			
 
				+	starpu_conf_init(&conf);
			
 
				+
			
 
				+	conf.ncuda = 0;
			
 
				+	conf.nopencl = 0;
			
 
				+	if (!getenv("STARPU_SCHED"))
			
 
				+		conf.sched_policy_name = "dmdas";
			
 
				+
			
 
				+	int ret = starpu_initialize(&conf, &argc, &argv);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	unsigned N, k, m, n, iter, NITER;
			
 
				+	if (argc < 2)
			
 
				+		N = 40;
			
 
				+	else
			
 
				+		N = atoi(argv[1]);
			
 
				+	if (argc < 3)
			
 
				+		NITER = 10;
			
 
				+	else
			
 
				+		NITER = atoi(argv[2]);
			
 
				+	if (N == 0)
			
 
				+	{
			
 
				+		starpu_shutdown();
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	/* Give parameter summary to user */
			
 
				+
			
 
				+	printf("freqs (MHz):\n");
			
 
				+	printf("%f %f %f\n", freq_min, freq_slow, freq_fast);
			
 
				+	printf("\n");
			
 
				+
			
 
				+	printf("per-core power (W):\n");
			
 
				+	printf("%f %f\n", power_min, power_fast);
			
 
				+	printf("%f %f %f\n", power(freq_min), power(freq_slow), power(freq_fast));
			
 
				+	printf("\n");
			
 
				+
			
 
				+	printf("kernel perfs in GFlops (min, slow, fast):\n");
			
 
				+	printf("gemm:\t%f %f %f\n",
			
 
				+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_min) / 1000,
			
 
				+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_slow) / 1000,
			
 
				+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_fast) / 1000);
			
 
				+
			
 
				+	printf("trsm:\t%f %f %f\n",
			
 
				+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_min) / 1000,
			
 
				+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_slow) / 1000,
			
 
				+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_fast) / 1000);
			
 
				+
			
 
				+	printf("potrf:\t%f %f %f\n",
			
 
				+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_min) / 1000,
			
 
				+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_slow) / 1000,
			
 
				+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_fast) / 1000);
			
 
				+	printf("\n");
			
 
				+
			
 
				+	printf("kernel efficiency in GFlops/W (min, slow, fast):\n");
			
 
				+	printf("gemm:\t%f %f %f\n",
			
 
				+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_min) / 1000 / power(freq_min),
			
 
				+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_slow) / 1000 / power(freq_slow),
			
 
				+			GEMM_FLOPS(TILE_SIZE) / _gemm_time(freq_fast) / 1000 / power(freq_fast));
			
 
				+
			
 
				+	printf("trsm:\t%f %f %f\n",
			
 
				+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_min) / 1000 / power(freq_min),
			
 
				+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_slow) / 1000 / power(freq_slow),
			
 
				+			TRSM_FLOPS(TILE_SIZE) / _trsm_time(freq_fast) / 1000 / power(freq_fast));
			
 
				+
			
 
				+	printf("potrf:\t%f %f %f\n",
			
 
				+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_min) / 1000 / power(freq_min),
			
 
				+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_slow) / 1000 / power(freq_slow),
			
 
				+			POTRF_FLOPS(TILE_SIZE) / _potrf_time(freq_fast) / 1000 / power(freq_fast));
			
 
				+	printf("\n");
			
 
				+
			
 
				+
			
 
				+	/* Now compute */
			
 
				+
			
 
				+	starpu_data_handle_t A[N][N];
			
 
				+
			
 
				+	for (m = 0; m < N; m++)
			
 
				+		for (n = 0; n < N; n++)
			
 
				+			starpu_void_data_register(&A[m][n]);
			
 
				+
			
 
				+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
			
 
				+
			
 
				+	double timing_sum = 0.;
			
 
				+	double energy_sum = 0.;
			
 
				+	double timing_sum2 = 0.;
			
 
				+	double energy_sum2 = 0.;
			
 
				+
			
 
				+	for (iter = 0; iter < NITER; iter++)
			
 
				+	{
			
 
				+		double start = starpu_timing_now();
			
 
				+		double start_energy = starpu_energy_used();
			
 
				+
			
 
				+		for (k = 0; k < N; k++)
			
 
				+		{
			
 
				+			starpu_iteration_push(k);
			
 
				+			ret = starpu_task_insert(&potrf_cl,
			
 
				+						 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k) : STARPU_MAX_PRIO,
			
 
				+						 STARPU_RW, A[k][k],
			
 
				+						 STARPU_FLOPS, (double) FLOPS_SPOTRF(TILE_SIZE),
			
 
				+						 STARPU_TAG_ONLY, TAG11(k),
			
 
				+						 0);
			
 
				+			if (ret == -ENODEV) return 77;
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				+
			
 
				+			for (m = k+1; m<N; m++)
			
 
				+			{
			
 
				+				ret = starpu_task_insert(&trsm_cl,
			
 
				+							 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				+							 STARPU_R, A[k][k],
			
 
				+							 STARPU_RW, A[m][k],
			
 
				+							 STARPU_FLOPS, (double) FLOPS_STRSM(TILE_SIZE, TILE_SIZE),
			
 
				+							 STARPU_TAG_ONLY, TAG21(m,k),
			
 
				+							 0);
			
 
				+				if (ret == -ENODEV) return 77;
			
 
				+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				+			}
			
 
				+
			
 
				+			for (m = k+1; m<N; m++)
			
 
				+			{
			
 
				+				for (n = k+1; n<N; n++)
			
 
				+				{
			
 
				+					if (n <= m)
			
 
				+					{
			
 
				+						ret = starpu_task_insert(&gemm_cl,
			
 
				+									 STARPU_PRIORITY, unbound_prio ? (int)(2*N - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
			
 
				+									 STARPU_R, A[m][k],
			
 
				+									 STARPU_R, A[n][k],
			
 
				+									 gemm_cl.modes[2], A[m][n],
			
 
				+									 STARPU_FLOPS, (double) FLOPS_SGEMM(TILE_SIZE, TILE_SIZE, TILE_SIZE),
			
 
				+									 STARPU_TAG_ONLY, TAG22(k,m,n),
			
 
				+									 0);
			
 
				+						if (ret == -ENODEV) return 77;
			
 
				+						STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
			
 
				+					}
			
 
				+				}
			
 
				+			}
			
 
				+			starpu_iteration_pop();
			
 
				+		}
			
 
				+
			
 
				+		starpu_task_wait_for_all();
			
 
				+
			
 
				+		double end = starpu_timing_now();
			
 
				+		double end_energy = starpu_energy_used();
			
 
				+		double timing = end - start;
			
 
				+		double energy = end_energy - start_energy;
			
 
				+		timing_sum += timing;
			
 
				+		timing_sum2 += timing*timing;
			
 
				+		energy_sum += energy;
			
 
				+		energy_sum2 += energy*energy;
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	/* Make stats and print */
			
 
				+
			
 
				+	double timing_avg = timing_sum / NITER;
			
 
				+	double timing_dev = sqrt((fabs(timing_sum2 - (timing_sum*timing_sum)/NITER))/NITER);
			
 
				+	double energy_avg = energy_sum / NITER;
			
 
				+	double energy_dev = sqrt((fabs(energy_sum2 - (energy_sum*energy_sum)/NITER))/NITER);
			
 
				+	double flop = FLOPS_SPOTRF(TILE_SIZE * N);
			
 
				+
			
 
				+	unsigned toprint_slow;
			
 
				+	if (ncpu_slow >= 0)
			
 
				+		toprint_slow = ncpu_slow;
			
 
				+	else
			
 
				+		toprint_slow = freq_slow;
			
 
				+
			
 
				+	printf("# size\t%s\tms +-\tGFlop/s +-\ten. (J) +-\tGF/W\n",
			
 
				+			ncpu_slow >= 0 ? "nslow" : "fslow");
			
 
				+	printf("%u\t%u\t%.0f %.1f\t%.1f %.1f\t%.1f %.1f\t%.2f\n",
			
 
				+			TILE_SIZE * N,
			
 
				+			toprint_slow,
			
 
				+			timing_avg/1000,
			
 
				+			timing_dev/1000,
			
 
				+			(flop/timing_avg/1000.0f),
			
 
				+			(flop/(timing_avg*timing_avg)/1000.f)*timing_dev,
			
 
				+			energy_avg, energy_dev,
			
 
				+			flop/1000000000./energy_avg);
			
 
				+
			
 
				+	for (m = 0; m < N; m++)
			
 
				+		for (n = 0; n < N; n++)
			
 
				+			starpu_data_unregister(A[m][n]);
			
 
				+
			
 
				+out:
			
 
				+	starpu_shutdown();
			
 
				+	return 0;
			
 
				+}
			
--- a/tests/energy/perfs.gp
+++ b/tests/energy/perfs.gp
@@ -0,0 +1,73 @@
 
				+# Plot the synthetic per-core model used by the energy tests: power,
			
 
				+# kernel performance and kernel efficiency as functions of the frequency.
			
 
				+# Writes power.eps, perfs.eps and efficiency.eps in the current directory.
			
 
				+
			
 
				+set term postscript eps enhanced color font ",20"
			
 
				+set key top left
			
 
				+set xlabel "frequency (MHz)"
			
 
				+
			
 
				+# Frequencies are written as reals on purpose: gnuplot applies integer
			
 
				+# division to integer operands, which would truncate the constant term
			
 
				+# freq_min/(freq_fast-freq_min/2) of potrf_factor() to 0.
			
 
				+freq_min = 1200.0
			
 
				+freq_fast = 3500.0
			
 
				+power_min = 2
			
 
				+power_fast = 8.2
			
 
				+TRSM_DECAY = 0.5
			
 
				+POTRF_DECAY = 0.5
			
 
				+
			
 
				+
			
 
				+# Plot the power according to frequency (cubic curve)
			
 
				+# power() interpolates between (freq_min, power_min) and (freq_fast, power_fast).
			
 
				+
			
 
				+freq_min3 = freq_min * freq_min * freq_min
			
 
				+freq_fast3 = freq_fast * freq_fast * freq_fast
			
 
				+alpha = (power_fast - power_min) / (freq_fast3 - freq_min3)
			
 
				+power(frequency) = power_min + alpha * (frequency*frequency*frequency - freq_min3)
			
 
				+
			
 
				+set output "power.eps"
			
 
				+set ylabel "power (W)"
			
 
				+
			
 
				+plot [frequency=freq_min:freq_fast] [y=0:] power(frequency) lw 2 notitle
			
 
				+
			
 
				+
			
 
				+# Plot the kernel performance according to frequency
			
 
				+
			
 
				+set output "perfs.eps"
			
 
				+set ylabel "performance (GFlop/s)"
			
 
				+
			
 
				+# Performance of each kernel at freq_fast, where every *_factor() equals 1.
			
 
				+gemm_max_perf = 50
			
 
				+trsm_max_perf = 35.784040
			
 
				+potrf_max_perf = 6.964803
			
 
				+
			
 
				+# Scaling of each kernel's performance with frequency, normalized to 1 at freq_fast.
			
 
				+gemm_factor(frequency) = frequency / freq_fast
			
 
				+trsm_factor(frequency) = (frequency - freq_min/2) ** TRSM_DECAY / (freq_fast - freq_min/2) ** TRSM_DECAY
			
 
				+potrf_factor(frequency) = 1 - POTRF_DECAY * ((freq_min/(frequency-freq_min/2)) - (freq_min/(freq_fast-freq_min/2)))
			
 
				+
			
 
				+plot [frequency=freq_min:freq_fast] \
			
 
				+     gemm_max_perf * gemm_factor(frequency) lw 2 title "gemm", \
			
 
				+     trsm_max_perf * trsm_factor(frequency) lw 2 title "trsm", \
			
 
				+     potrf_max_perf * potrf_factor(frequency) lw 2 title "potrf"
			
 
				+
			
 
				+
			
 
				+# Plot the kernel efficiency according to frequency
			
 
				+
			
 
				+set output "efficiency.eps"
			
 
				+set key top right
			
 
				+set ylabel "efficiency (GFlop/W)"
			
 
				+
			
 
				+# Efficiency of each kernel at freq_fast, i.e. *_max_perf / power_fast.
			
 
				+gemm_max_efficiency = 6.097561
			
 
				+trsm_max_efficiency = 4.363907
			
 
				+potrf_max_efficiency = 0.849366
			
 
				+
			
 
				+# Power relative to the power drawn at freq_fast.
			
 
				+power_factor(frequency) = power(frequency) / power(freq_fast)
			
 
				+
			
 
				+plot [frequency=freq_min:freq_fast] \
			
 
				+     gemm_max_efficiency * gemm_factor(frequency) / power_factor(frequency) lw 2 title "gemm", \
			
 
				+     trsm_max_efficiency * trsm_factor(frequency) / power_factor(frequency)  lw 2 title "trsm", \
			
 
				+     potrf_max_efficiency * potrf_factor(frequency) / power_factor(frequency)  lw 2 title "potrf"
			
 
				+
			
--- a/tests/energy/static.sh
+++ b/tests/energy/static.sh
@@ -0,0 +1,67 @@
 
				+#!/bin/sh
			
 
				+#
			
 
				+# Run ./energy_efficiency for every number of slow CPUs (0..24) and for
			
 
				+# several STARPU_SCHED_GAMMA values, store the result lines in
			
 
				+# static.<gamma>.dat, then plot energy against performance and time.
			
 
				+
			
 
				+# To have 24 cores
			
 
				+export STARPU_HOSTNAME=sirocco
			
 
				+
			
 
				+# To avoid slowing down simulation
			
 
				+export MALLOC_PERTURB_=0
			
 
				+
			
 
				+# You can play with these
			
 
				+export STARPU_FREQ_SLOW=1200
			
 
				+export STARPU_POWER_SLOW=2
			
 
				+export STARPU_POWER_FAST=8.2
			
 
				+export N=40
			
 
				+export NITER=30
			
 
				+
			
 
				+GAMMAS="1000000 100000 10000 0"
			
 
				+
			
 
				+# The runs of one gamma value are started in the background.  grep keeps
			
 
				+# only the result line (it starts with the matrix size, N * 512, and a tab)
			
 
				+# and sort orders the collected lines by their second column.
			
 
				+for gamma in $GAMMAS; do
			
 
				+	(for ncpu_slow in $(seq 0 24) ; do
			
 
				+		STARPU_SCHED_GAMMA=$gamma STARPU_NCPU_SLOW=$ncpu_slow \
			
 
				+			./energy_efficiency $N $NITER | grep "^$(($N * 512))	" &
			
 
				+	done) | sort -n -k 2 > static.$gamma.dat
			
 
				+done
			
 
				+
			
 
				+# Generate the gnuplot script.  Whitespace-separated columns of the .dat
			
 
				+# files used below: 3/4 time (ms) +-, 5/6 GFlop/s +-, 7/8 energy (J) +-.
			
 
				+# The terminal is selected before the output file, as gnuplot recommends.
			
 
				+cat > static.gp << EOF
			
 
				+set term postscript eps enhanced color font ",20"
			
 
				+set output "static.eps"
			
 
				+set key top center
			
 
				+set xlabel "performance (GFlop/s)"
			
 
				+set ylabel "energy (J)"
			
 
				+
			
 
				+plot \\
			
 
				+EOF
			
 
				+for gamma in $GAMMAS; do
			
 
				+	cat >> static.gp << EOF
			
 
				+	"static.$gamma.dat" using 5:7:6:8 with xyerrorlines title "$gamma", \\
			
 
				+EOF
			
 
				+done
			
 
				+
			
 
				+cat >> static.gp << EOF
			
 
				+
			
 
				+set output "static-time.eps"
			
 
				+set xlabel "time (ms)"
			
 
				+set ylabel "energy (J)"
			
 
				+
			
 
				+plot \\
			
 
				+EOF
			
 
				+for gamma in $GAMMAS; do
			
 
				+	cat >> static.gp << EOF
			
 
				+	"static.$gamma.dat" using 3:7:4:8 with xyerrorlines title "$gamma", \\
			
 
				+EOF
			
 
				+done
			
 
				+
			
 
				+
			
 
				+gnuplot static.gp
			
 
				+gv static.eps &
			
 
				+gv static-time.eps &