Bladeren bron

Add papi- and nvml-based energy measurement

Mariem makni 4 jaren geleden
bovenliggende
commit
f14fe3bd4b

+ 2 - 0
.gitignore

@@ -20,6 +20,8 @@
 ,*
 .libs
 .deps
+*.orig
+*.rej
 *.o
 *.lo
 *.la

+ 1 - 0
ChangeLog

@@ -46,6 +46,7 @@ New features:
   * Add profiling based on papi performance counters.
   * Add an experimental python interface (not actually parallel yet)
   * Add task submission file+line in traces.
+  * Add papi- and nvml-based energy measurement.
 
 Small changes:
   * Add a synthetic energy efficiency testcase.

+ 7 - 4
configure.ac

@@ -2110,10 +2110,6 @@ if test x$use_fxt = xyes; then
 		FXT_LIBS="$(pkg-config --variable=libdir fxt)/libfxt.a -Wl,--as-needed $(pkg-config --libs --static fxt) -Wl,--no-as-needed"
 	fi
 
-	AC_CHECK_LIB([papi], [PAPI_library_init],
-		     [AC_DEFINE([STARPU_PAPI], [1], [Define to 1 if you have the libpapi library])
-		      PAPI_LIBS=-lpapi])
-
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################
@@ -2141,6 +2137,13 @@ if  test x$enable_fxt_lock = xyes; then
 	AC_DEFINE(STARPU_FXT_LOCK_TRACES, [1], [enable additional locking systems FxT traces])
 fi
 
+AC_CHECK_LIB([papi], [PAPI_library_init],
+	     [AC_DEFINE([STARPU_PAPI], [1], [Define to 1 if you have the libpapi library])
+	      PAPI_LIBS=-lpapi])
+AC_SUBST(PAPI_LIBS)
+
+AM_CONDITIONAL([STARPU_USE_PAPI], [test "x$PAPI_LIBS" != "x"])
+
 AC_MSG_CHECKING(whether performance debugging should be enabled)
 AC_ARG_ENABLE(perf-debug, [AS_HELP_STRING([--enable-perf-debug],
 			[enable performance debugging through gprof])],

File diff suppressed because it is too large
+ 58 - 0
doc/doxygen/chapters/320_scheduling.doxy


+ 33 - 2
include/starpu_perfmodel.h

@@ -319,6 +319,21 @@ void starpu_perfmodel_init(struct starpu_perfmodel *model);
 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
 /**
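+   Start measuring the energy consumed by workers of type \p archi.
+   For CPU workers this starts counting PAPI RAPL energy events. The
+   measurement is ended by calling starpu_energy_stop().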
+*/
+
+int starpu_energy_start(enum starpu_worker_archtype archi);
+
+/**
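+   Stop the energy measurement started by starpu_energy_start() for workers
+   of type \p archi.
+
+   For CPU workers, the energy measured since starpu_energy_start() is divided
+   by \p ntasks, the number of tasks executed in between, and the resulting
+   per-task average is recorded in \p model as a measurement for \p task and
+   its implementation \p nimpl.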
+*/
+
+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, enum starpu_worker_archtype archi);
+
+
+/**
    Load the performance model found in the file named \p filename. \p model has to be
    completely zero, and will be filled with the information stored in the given file.
 */
@@ -414,16 +429,32 @@ int starpu_perfmodel_print_estimations(struct starpu_perfmodel *model, uint32_t
 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model);
 
 /**
-   Feed the performance model model with an explicit
-   measurement measured (in µs), in addition to measurements done by StarPU
+   Feed the performance model \p model with one explicit
+   measurement (in µs or J), in addition to measurements done by StarPU
    itself. This can be useful when the application already has an
    existing set of measurements done in good conditions, that StarPU
    could benefit from instead of doing on-line measurements. An example
    of use can be seen in \ref PerformanceModelExample.
+
+   Note that this records only one measurement, and StarPU would ignore
+   the first measurement (since it is usually disturbed by library loading
+   etc.). Make sure to call this function several times to record all your
+   measurements.
+
+   You can also call starpu_perfmodel_update_history_n() to directly provide an
+   average performed on several tasks.
 */
 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double measured);
 
 /**
+   Feed the performance model \p model with an explicit average measurement (in µs or J).
+
+   This is similar to starpu_perfmodel_update_history(), but records a batch of
+   \p number measurements provided as the average of the measurements \p average_measured.
+*/
+void starpu_perfmodel_update_history_n(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch *arch, unsigned cpuid, unsigned nimpl, double average_measured, unsigned number);
+
+/**
    Print the directory name storing performance models on \p output
 */
 void starpu_perfmodel_directory(FILE *output);

+ 1 - 0
src/Makefile.am

@@ -196,6 +196,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	core/disk_ops/disk_unistd.c                             \
 	core/disk_ops/unistd/disk_unistd_global.c		\
 	core/perfmodel/perfmodel_history.c			\
+        core/perfmodel/energy_model.c                           \
 	core/perfmodel/perfmodel_bus.c				\
 	core/perfmodel/perfmodel.c				\
 	core/perfmodel/perfmodel_print.c			\

+ 235 - 0
src/core/perfmodel/energy_model.c

@@ -0,0 +1,235 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2008-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <papi.h>
+#include "hwloc.h"
+#include <starpu_perfmodel.h>
+#include <starpu_profiling.h>
+#include <common/config.h>
+#include <common/utils.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <sys/stat.h>
+#include <core/perfmodel/perfmodel.h>
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <datawizard/datawizard.h>
+#include <core/task.h>
+
+#ifdef STARPU_USE_CUDA
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
+#include <nvml.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+#endif
+
+#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  exit(retval); } while (0)
+/* ERROR_RETURN() above prints the error code and the source location on
+ * stderr, then exits the process with that code.
+ * debug() below forwards to printf() while the #if condition is 1 and
+ * expands to nothing otherwise. */
+#if 1
+#define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
+#else
+#define debug(fmt, ...)
+#endif
+/* Number of entries of event_names[], i.e. events added per socket. */
+static const int N_EVTS = 2;
+/* Number of hwloc packages, refreshed by each starpu_energy_start() call. */
+static int nsockets;
+/* printf-style templates of the PAPI event names; add_event() substitutes %d. */
+static const char* event_names[] = { "rapl::RAPL_ENERGY_PKG:cpu=%d",
+
+                              "rapl::RAPL_ENERGY_DRAM:cpu=%d"};
+/* Adds the N_EVTS events of one socket to an event set (defined at the end of this file). */
+static int add_event(int EventSet, int socket);
+
+/* PAPI state shared between starpu_energy_start() and starpu_energy_stop() */
+
+/* Must be initialized to PAPI_NULL before calling PAPI_create_eventset() */
+static int EventSet = PAPI_NULL;
+
+/* Counter values read from the event set by PAPI_stop(); allocated in
+ * starpu_energy_start() with nsockets * N_EVTS entries */
+static long long *values;
+/* starpu_timing_now() timestamp taken when the CPU measurement is started. */
+static double t1;
+/* NVML state: total-energy readings taken at start and stop, and the handle of device 0. */
+#ifdef STARPU_USE_CUDA
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
+static unsigned long long energy_begin, energy_end;
+static nvmlDevice_t device;
+#endif
+#endif
+
+/*
+ * Start an energy measurement for workers of type archi.
+ *
+ * STARPU_CPU_WORKER: allocates the counter array, initializes PAPI, creates
+ * an event set holding the events of event_names[] for every hwloc package,
+ * starts counting and records the start time in t1.
+ * STARPU_CUDA_WORKER (only when built with CUDA and NVML energy support):
+ * records the current total energy consumption of NVML device 0.
+ *
+ * Returns 0 on success, the NVML status if an NVML call fails, and -1 for an
+ * unsupported worker type. A failing PAPI call terminates the process through
+ * ERROR_RETURN().
+ */
+int starpu_energy_start(enum starpu_worker_archtype archi)
+{
+	/* Returned on every path, so it must never be left uninitialized. */
+	int retval = 0;
+	int number;
+	int i;
+
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	hwloc_topology_t topology = config->topology.hwtopology;
+
+	nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
+
+	switch (archi)
+	{
+	case STARPU_CPU_WORKER:
+
+		/* Do not leak the array of an earlier measurement. */
+		free(values);
+		values = calloc(nsockets * N_EVTS, sizeof(long long));
+		STARPU_ASSERT(values);
+
+		if ((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT)
+			ERROR_RETURN(retval);
+
+		/* Creating the eventset */
+		if ((retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		for (i = 0; i < nsockets; i++)
+		{
+			/* The events are named after the OS index of the package */
+			hwloc_obj_t obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, i);
+			STARPU_ASSERT(obj);
+			add_event(EventSet, obj->os_index);
+		}
+
+		/* get the number of events in the event set */
+		number = 0;
+		if ((retval = PAPI_list_events(EventSet, NULL, &number)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		debug("There are %d events in the event set\n", number);
+
+		/* Start counting */
+		if ((retval = PAPI_start(EventSet)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		t1 = starpu_timing_now();
+		break;
+
+/* Same condition as the one guarding the declarations of device and energy_begin */
+#if defined(STARPU_USE_CUDA) && defined(HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION)
+	case STARPU_CUDA_WORKER:
+	{
+		int ret = nvmlDeviceGetHandleByIndex_v2(0, &device);
+		if (ret == NVML_SUCCESS)
+			ret = nvmlDeviceGetTotalEnergyConsumption(device, &energy_begin);
+		retval = ret;
+		break;
+	}
+#endif
+
+	default:
+		printf("Error: worker is not supported ! \n");
+		retval = -1;
+		break;
+	}
+
+	return retval;
+}
+
+/*
+ * Stop the energy measurement started by starpu_energy_start().
+ *
+ * STARPU_CPU_WORKER: stops the PAPI counters, sums the values of all events of
+ * all sockets, scales the sum, divides it by ntasks and records the resulting
+ * per-task average in model as ntasks measurements for task / nimpl. The PAPI
+ * event set and the counter array are then released.
+ * STARPU_CUDA_WORKER (only when built with CUDA and NVML energy support):
+ * reads the total energy consumption again and prints the difference.
+ *
+ * Returns 0 on success, the NVML status if the NVML call fails, and -1 when
+ * ntasks is 0 or the worker type is unsupported. A failing PAPI call
+ * terminates the process through ERROR_RETURN().
+ */
+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, enum starpu_worker_archtype archi)
+{
+	double energy = 0.;
+
+	/* Returned on every path, so it must never be left uninitialized. */
+	int retval = 0;
+	/* NOTE(review): worker 0 / cpu 0 are assumed to be representative of the
+	 * measured workers -- confirm against callers. */
+	unsigned workerid = 0;
+	unsigned cpuid = 0;
+	double t2 = starpu_timing_now();
+	double t = t2 - t1;
+
+	switch (archi)
+	{
+	case STARPU_CPU_WORKER:
+	{
+		int k, s;
+
+		/* Stop counting and store the values into the array */
+		if ((retval = PAPI_stop(EventSet, values)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		for (s = 0; s < nsockets; s++)
+		{
+			for (k = 0; k < N_EVTS; k++)
+			{
+				energy += values[s * N_EVTS + k];
+
+				/* The figure printed here is the running total so far */
+				debug("%-40s%12.6f J\t(for %f us, Average Power %.1fW)\n",
+					event_names[k],
+					(energy*0.23/1.0e9),
+					t,
+					((energy*0.23/1.0e9)/(t*1.0E-6))
+				);
+			}
+		}
+
+		if (ntasks > 0)
+		{
+			struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid, STARPU_NMAX_SCHED_CTXS);
+
+			/* NOTE(review): 0.23 / 1e9 is the scaling applied to the raw counter
+			 * values to obtain Joules -- confirm the unit reported by PAPI. */
+			energy = energy * 0.23 / 1.0e9 / ntasks;
+
+			/* The value is an average over ntasks tasks: record it as such, so
+			 * that it is weighted as ntasks samples rather than a single one. */
+			starpu_perfmodel_update_history_n(model, task, arch, cpuid, nimpl, energy, ntasks);
+		}
+
+		/* The counter array is allocated anew by each starpu_energy_start() */
+		free(values);
+		values = NULL;
+
+		/*removes all events from a PAPI event set */
+		if ((retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		/*deallocates the memory associated with an empty PAPI EventSet*/
+		if ((retval = PAPI_destroy_eventset(&EventSet)) != PAPI_OK)
+			ERROR_RETURN(retval);
+
+		/* Without any task there is no per-task average to record */
+		if (ntasks == 0)
+			retval = -1;
+
+		break;
+	}
+
+/* Same condition as the one guarding the declarations of device and energy_end */
+#if defined(STARPU_USE_CUDA) && defined(HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION)
+	case STARPU_CUDA_WORKER:
+	{
+		int ret = nvmlDeviceGetTotalEnergyConsumption(device, &energy_end);
+		if (ret != NVML_SUCCESS)
+		{
+			retval = ret;
+			break;
+		}
+		/* NOTE(review): the measurement is only printed here, it is not recorded in model */
+		debug("energy consumption on device %d is %llu mJ \n", 0, (energy_end - energy_begin));
+		break;
+	}
+#endif
+
+	default:
+		printf("Error: worker type %d is not supported! \n", archi);
+		retval = -1;
+		break;
+
+	}
+
+	return retval;
+
+}
+
+/*
+ * Add the N_EVTS events of event_names[] for socket `socket` to the PAPI event
+ * set `eventSet`.
+ *
+ * Each event is first resolved to an event code and added by code; if either
+ * step fails, it is added by name instead. If that fails too, the process
+ * exits with status 1. Returns PAPI_OK otherwise.
+ */
+static int add_event(int eventSet, int socket)
+{
+	int i;
+
+	for (i = 0; i < N_EVTS; i++)
+	{
+		char buf[255];
+		int code;
+		int retval;
+
+		/* Bounded formatting of the event name for this socket */
+		snprintf(buf, sizeof(buf), event_names[i], socket);
+
+		/* Only use the code when the name could actually be resolved */
+		retval = PAPI_event_name_to_code(buf, &code);
+		if (retval == PAPI_OK)
+			retval = PAPI_add_event(eventSet, code);
+
+		if (retval != PAPI_OK)
+		{
+			retval = PAPI_add_named_event(eventSet, buf);
+			if (retval != PAPI_OK)
+			{
+				printf("cannot add event\n");
+				exit (1);
+			}
+		}
+	}
+
+	return ( PAPI_OK );
+}

+ 1 - 1
src/core/perfmodel/perfmodel.h

@@ -81,7 +81,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 double _starpu_multiple_regression_based_job_expected_perf(struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch,
 					struct _starpu_job *j, unsigned nimpl);
 void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch * arch,
-				unsigned cpuid, double measured, unsigned nimpl);
+					unsigned cpuid, double measured, unsigned nimpl, unsigned number);
 int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch);
 
 void _starpu_create_sampling_directory_if_needed(void);

+ 17 - 12
src/core/perfmodel/perfmodel_history.c

@@ -804,7 +804,7 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, in
 	/* Dump the history into the model file in case it is necessary */
        if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED || model->type == STARPU_REGRESSION_BASED)
 	{
-		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean (us)\tdev (us)\tsum\t\tsum2\t\tn\n");
+		fprintf(f, "# hash\t\tsize\t\tflops\t\tmean (us or J)\tdev (us or J)\tsum\t\tsum2\t\tn\n");
 		ptr = per_arch_model->list;
 		while (ptr)
 		{
@@ -1839,7 +1839,7 @@ int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch)
 	return comb;
 }
 
-void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl)
+void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl, unsigned number)
 {
 	STARPU_ASSERT_MSG(measured >= 0, "measured=%lf\n", measured);
 	if (model)
@@ -1909,11 +1909,11 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 				/* For history-based, do not take the first measurement into account, it is very often quite bogus */
 				/* TODO: it'd be good to use a better estimation heuristic, like the median, or latest n values, etc. */
-				if (model->type != STARPU_HISTORY_BASED)
+				if (number != 1 || model->type != STARPU_HISTORY_BASED)
 				{
-					entry->sum = measured;
-					entry->sum2 = measured*measured;
-					entry->nsample = 1;
+					entry->sum = measured * number;
+					entry->sum2 = measured*measured * number;
+					entry->nsample = number;
 					entry->mean = measured;
 				}
 
@@ -1934,7 +1934,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 					(100 * local_deviation > (100 + historymaxerror)
 					 || (100 / local_deviation > (100 + historymaxerror))))
 				{
-					entry->nerror++;
+					entry->nerror+=number;
 
 					/* More errors than measurements, we're most probably completely wrong, we flush out all the entries */
 					if (entry->nerror >= entry->nsample)
@@ -1952,9 +1952,9 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 				}
 				else
 				{
-					entry->sum += measured;
-					entry->sum2 += measured*measured;
-					entry->nsample++;
+					entry->sum += measured * number;
+					entry->sum2 += measured*measured * number;
+					entry->nsample += number;
 
 					unsigned n = entry->nsample;
 					entry->mean = entry->sum / n;
@@ -2070,7 +2070,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 	}
 }
 
-void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured)
+void starpu_perfmodel_update_history_n(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured, unsigned number)
 {
 	struct _starpu_job *job = _starpu_get_job_associated_to_task(task);
 
@@ -2080,11 +2080,16 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 
 	_starpu_init_and_load_perfmodel(model);
 	/* Record measurement */
-	_starpu_update_perfmodel_history(job, model, arch, cpuid, measured, nimpl);
+	_starpu_update_perfmodel_history(job, model, arch, cpuid, measured, nimpl, number);
 	/* and save perfmodel on termination */
 	_starpu_set_calibrate_flag(1);
 }
 
+void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *task, struct starpu_perfmodel_arch * arch, unsigned cpuid, unsigned nimpl, double measured)
+{
+	starpu_perfmodel_update_history_n(model, task, arch, cpuid, nimpl, measured, 1); /* one single measurement: forward it as a batch of 1 */
+}
+
 int starpu_perfmodel_list_combs(FILE *output, struct starpu_perfmodel *model)
 {
 	int comb;

+ 4 - 0
src/core/sched_policy.c

@@ -417,6 +417,10 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 
 int _starpu_push_task(struct _starpu_job *j)
 {
+#ifdef STARPU_SIMGRID
+	//if (_starpu_simgrid_task_push_cost())
+		starpu_sleep(0.000001);
+#endif
 	if(j->task->prologue_callback_func)
 	{
 		_starpu_set_current_task(j->task);

+ 2 - 2
src/drivers/driver_common/driver_common.c

@@ -276,7 +276,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 				do_update_time_model = 0;
 			if (do_update_time_model)
 			{
-				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl);
+				_starpu_update_perfmodel_history(j, j->task->cl->model, perf_arch, worker->devid, time_consumed, j->nimpl, 1);
 			}
 		}
 	}
@@ -312,7 +312,7 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 			do_update_energy_model = 0;
 		if (do_update_energy_model)
 		{
-			_starpu_update_perfmodel_history(j, j->task->cl->energy_model, perf_arch, worker->devid, energy_consumed, j->nimpl);
+			_starpu_update_perfmodel_history(j, j->task->cl->energy_model, perf_arch, worker->devid, energy_consumed, j->nimpl, 1);
 		}
 	}
 }

+ 116 - 0
tests/perfmodels/regression_based_memset.c

@@ -19,6 +19,8 @@
 #include <starpu_scheduler.h>
 #include "../helper.h"
 
+#define ERROR_RETURN(retval) { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  exit(retval); }
+
 /*
  * Benchmark memset with a linear and non-linear regression
  */
@@ -27,8 +29,10 @@
 #define START 1024
 #ifdef STARPU_QUICK_CHECK
 #define END 1048576
+#define NENERGY 3
 #else
 #define END 16777216
+#define NENERGY 100
 #endif
 
 #ifdef STARPU_USE_CUDA
@@ -85,6 +89,18 @@ static struct starpu_perfmodel nl_model =
 	.symbol = "non_linear_memset_regression_based"
 };
 
+static struct starpu_perfmodel energy_model = /* energy model of memset_cl, linear regression */
+{
+	.type = STARPU_REGRESSION_BASED,
+	.symbol = "memset_regression_based_energy"
+};
+/* Energy model of nl_memset_cl, non-linear regression */
+static struct starpu_perfmodel nl_energy_model =
+{
+	.type = STARPU_NL_REGRESSION_BASED,
+	.symbol = "non_linear_memset_regression_based_energy"
+};
+
 static struct starpu_codelet memset_cl =
 {
 #ifdef STARPU_USE_CUDA
@@ -98,6 +114,7 @@ static struct starpu_codelet memset_cl =
 	.cpu_funcs = {memset0_cpu, memset_cpu},
 	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
 	.model = &model,
+	.energy_model = &energy_model,
 	.nbuffers = 1,
 	.modes = {STARPU_W}
 };
@@ -115,6 +132,7 @@ static struct starpu_codelet nl_memset_cl =
 	.cpu_funcs = {memset0_cpu, memset_cpu},
 	.cpu_funcs_name = {"memset0_cpu", "memset_cpu"},
 	.model = &nl_model,
+	.energy_model = &nl_energy_model,
 	.nbuffers = 1,
 	.modes = {STARPU_W}
 };
@@ -142,6 +160,68 @@ static void test_memset(int nelems, struct starpu_codelet *codelet)
         starpu_data_unregister(handle);
 }
 
+/*
+ * Submit NENERGY tasks per worker of type archtype, each running codelet on
+ * its own vector of nelems ints, then unregister all the vectors.
+ *
+ * where is stored in the where field of each task. impl is not used here: the
+ * callers in main() select the implementation through the cpu_funcs entries
+ * of the codelet.
+ *
+ * Returns the number of tasks submitted. Exits with STARPU_TEST_SKIPPED when
+ * there is no worker of that type or when submission returns -ENODEV.
+ */
+static int test_memset_energy(int nelems, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
+{
+	int nloops = starpu_worker_get_count_by_type(archtype) * NENERGY;
+	int loop;
+	starpu_data_handle_t *handle;
+
+	(void) impl;
+
+	/* Nothing can be measured without a worker, and the caller divides by the
+	 * returned count */
+	if (nloops <= 0)
+		exit(STARPU_TEST_SKIPPED);
+
+	/* On the heap rather than in a variable-length array, whose size depends
+	 * on the number of workers */
+	handle = calloc(nloops, sizeof(*handle));
+	STARPU_ASSERT(handle);
+
+	for (loop = 0; loop < nloops; loop++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		starpu_vector_data_register(&handle[loop], -1, (uintptr_t)NULL, nelems, sizeof(int));
+
+		task->cl = codelet;
+		task->where = where;
+		task->handles[0] = handle[loop];
+		task->flops = nelems;
+
+		int ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+			exit(STARPU_TEST_SKIPPED);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	/* NOTE(review): unregistering is presumably what waits for the submitted
+	 * tasks to complete -- confirm */
+	for (loop = 0; loop < nloops; loop++)
+	{
+		starpu_data_unregister(handle[loop]);
+	}
+
+	free(handle);
+
+	return nloops;
+}
+
+/*
+ * Calibrate the energy model of codelet for implementation impl.
+ *
+ * For each vector size from STARTlin, doubling while below END, the energy
+ * consumed by one batch of tasks (see test_memset_energy()) is measured
+ * between starpu_energy_start() and starpu_energy_stop() for workers of type
+ * archtype. A task which is never submitted is passed to starpu_energy_stop()
+ * along with the energy model of the codelet.
+ *
+ * Exits through ERROR_RETURN() when starting or stopping the measurement fails.
+ */
+static void bench_energy(int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
+{
+	int size;
+	int retval;
+	int ntasks;
+
+	for (size = STARTlin; size < END; size *= 2)
+	{
+		starpu_data_handle_t handle;
+		starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
+
+		/* Not submitted: only handed to starpu_energy_stop() below */
+		struct starpu_task *task = starpu_task_create();
+		task->cl = codelet;
+		task->handles[0] = handle;
+		task->synchronous = 1;
+		task->destroy = 0;
+		task->flops = size;
+
+		/* Measure for the requested worker type instead of a hard-coded one */
+		if ( (retval = starpu_energy_start(archtype)) != 0)
+			ERROR_RETURN(retval);
+
+		/* Run the batch of tasks whose energy is measured */
+		ntasks = test_memset_energy(size, where, archtype, impl, codelet);
+
+		if ( (retval = starpu_energy_stop(codelet->energy_model, task, impl, ntasks, archtype)) != 0)
+			ERROR_RETURN(retval);
+
+		starpu_task_destroy (task);
+		starpu_data_unregister(handle);
+	}
+}
+
 static void show_task_perfs(int size, struct starpu_task *task)
 {
 	unsigned workerid;
@@ -227,5 +307,41 @@ int main(int argc, char **argv)
 #endif
 	starpu_shutdown();
 
+
+	starpu_conf_init(&conf);
+
+	/* Use a scheduler which doesn't choose the implementation */
+	conf.sched_policy_name = "eager";
+	conf.calibrate = 1;
+
+	ret = starpu_initialize(&conf, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("tests/perfmodels/opencl_memset_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+
+	memset_cl.cpu_funcs[1] = NULL;
+	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 0, &memset_cl);
+	memset_cl.cpu_funcs[1] = memset_cpu;
+	memset_cl.cpu_funcs[0] = NULL;
+	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 1, &memset_cl);
+
+	nl_memset_cl.cpu_funcs[1] = NULL;
+	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 0, &nl_memset_cl);
+	nl_memset_cl.cpu_funcs[1] = memset_cpu;
+	nl_memset_cl.cpu_funcs[0] = NULL;
+	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 1, &nl_memset_cl);
+
+#ifdef STARPU_USE_OPENCL
+        ret = starpu_opencl_unload_opencl(&opencl_program);
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
+#endif
+	starpu_shutdown();
+
+
 	return EXIT_SUCCESS;
 }