4 years ago · 88e11d0908
--- a/doc/doxygen/chapters/320_scheduling.doxy
+++ b/doc/doxygen/chapters/320_scheduling.doxy
@@ -248,14 +248,8 @@ We have extended the performance model of StarPU to measure energy and power val
 
				 
			
 
				 
			
 
				 - To measure energy consumption of CPUs, we use the <c>RAPL</c> events, which are available on CPU architecture:
			
 
				-
			
 
				-const char* event_names[] = { "rapl::RAPL_ENERGY_PKG:cpu=%d",
			
 
				-                              "rapl::RAPL_ENERGY_DRAM:cpu=%d"};
			
 
				-
			
 
				-
			
 
				-Where <c>RAPL_ENERGY_PKG</c> represents the whole CPU socket power consumption.
			
 
				-
			
 
				-and <c>RAPL_ENERGY_DRAM</c> represents the RAM power consumption.
			
 
				+<c>RAPL_ENERGY_PKG</c>, which represents the whole CPU socket power consumption,
			
 
				+and <c>RAPL_ENERGY_DRAM</c>, which represents the RAM power consumption.
			
 
				 
			
 
				 
			
 
				 
			
@@ -269,6 +263,10 @@ In order to use the right <c>rapl events</c> for energy measurement, user should
 
				 $ papi_native_avail
			
 
				 \endverbatim
			
 
				 
			
 
				+Depending on the system configuration, the user may have to run this as <b>root</b> to get the performance counter values.
			
 
				+
			
 
				+Since the measurement is for all the CPUs and the memory, the approach taken
			
 
				+here is to run a series of tasks on all of them and to take the overall measurement.
			
 
				 
			
 
				 - The example below illustrates the energy and power measurements, using <c>starpu_energy_start()</c> and <c>starpu_energy_stop()</c> functions. 
			
 
				  
			
@@ -282,13 +280,13 @@ In this example, we launch several tasks of the same type in parallel. To perfor
 
				        
			
 
				         unsigned N = starpu_cpu_worker_get_count() * 40;
			
 
				 
			
 
				-        starpu_energy_start();
			
 
				+        starpu_energy_start(-1, STARPU_CPU_WORKER);
			
 
				 
			
 
				         for (i = 0 ; i < N ; i++)
			
 
				-	  starpu_task_insert(&cl, STARPU_R, arg1, STARPU_RW, arg2, 0);
			
 
				+	  starpu_task_insert(&cl, STARPU_EXECUTE_WHERE, STARPU_CPU, STARPU_R, arg1, STARPU_RW, arg2, 0);
			
 
				 
			
 
				         starpu_task_t *specimen = starpu_task_build(&cl, STARPU_R, arg1, STARPU_RW, arg2, 0);
			
 
				-        starpu_energy_stop(&codelet.energy_model, specimen, N);
			
 
				+        starpu_energy_stop(&codelet.energy_model, specimen, 0, N, -1, STARPU_CPU_WORKER);
			
 
				 
			
 
				        . . .
			
 
				 
			
@@ -300,6 +298,28 @@ For the energy and power measurements, depending on the system configuration the
 
				 
			
 
				 <c>starpu_energy_stop()</c> function uses <c>PAPI_stop()</c> to stop counting and store the values into the array. We calculate both energy in <c>Joules</c> and power consumption in <c>Watt</c>. We call the starpu_perfmodel_update_history() function in the performance model to provide explicit measurements.
			
 
				 
			
 
				+- In the CUDA case, nvml provides per-GPU energy measurement. We can thus calibrate the performance models per GPU:
			
 
				+
			
 
				+\code{.c}
			
 
				+       
			
 
				+        unsigned N = 40;
			
 
				+
			
 
				+	for (i = 0; i < starpu_cuda_worker_get_count(); i++) {
			
 
				+		int workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, i);
			
 
				+
			
 
				+		starpu_energy_start(workerid, STARPU_CUDA_WORKER);
			
 
				+
			
 
				+		for (i = 0 ; i < N ; i++)
			
 
				+		  starpu_task_insert(&cl, STARPU_EXECUTE_ON_WORKER, workerid, STARPU_R, arg1, STARPU_RW, arg2, 0);
			
 
				+
			
 
				+		starpu_task_t *specimen = starpu_task_build(&cl, STARPU_R, arg1, STARPU_RW, arg2, 0);
			
 
				+		starpu_energy_stop(&codelet.energy_model, specimen, 0, N, workerid, STARPU_CUDA_WORKER);
			
 
				+
			
 
				+       }
			
 
				+       . . .
			
 
				+
			
 
				+\endcode
			
 
				+
			
 
				 \section StaticScheduling Static Scheduling
			
 
				 
			
 
				 In some cases, one may want to force some scheduling, for instance force a given
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -320,17 +320,25 @@ int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
				 
			
 
				 /**
			
 
				    starpu_energy_start - start counting hardware events in an event set
			
 
				+
			
 
				+   - \p workerid is the worker on which calibration is to be performed (only relevant for GPUs; use -1 for CPUs)
			
 
				+   - \p archi is the type of architecture on which calibration will be run
			
 
				 */
			
 
				 
			
 
				-int starpu_energy_start(enum starpu_worker_archtype archi);
			
 
				+int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
			
 
				 
			
 
				 /**
			
 
				    starpu_energy_stop - stop counting hardware events in an event set
			
 
				-   \values -- an array to hold the counter values of the counting events
			
 
				-   \EventSet -- an integer handle for a PAPI event set as created by papi_create_eventset()
			
 
				+
			
 
				+   - \p model is the energy performance model to be filled with the result
			
 
				+   - \p task is a task specimen, so the performance model folds the result according to the parameter sizes of the task.
			
 
				+   - \p nimpl is the implementation number run during calibration
			
 
				+   - \p ntasks is the number of tasks run during calibration
			
 
				+   - \p workerid is the worker on which calibration was performed (only relevant for GPUs; use -1 for CPUs)
			
 
				+   - \p archi is the type of architecture on which calibration was run
			
 
				 */
			
 
				 
			
 
				-int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, enum starpu_worker_archtype archi);
			
 
				+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
			
 
				 
			
 
				 
			
 
				 /**
			
--- a/src/core/perfmodel/energy_model.c
+++ b/src/core/perfmodel/energy_model.c
@@ -15,8 +15,12 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef STARPU_PAPI
			
 
				 #include <papi.h>
			
 
				-#include "hwloc.h"
			
 
				+#endif
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+#include <hwloc.h>
			
 
				+#endif
			
 
				 #include <starpu_perfmodel.h>
			
 
				 #include <starpu_profiling.h>
			
 
				 #include <common/config.h>
			
@@ -39,14 +43,15 @@
 
				 #endif
			
 
				 #endif
			
 
				 
			
 
				-#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  exit(retval); } while (0)
			
 
				+#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); } while (0)
			
 
				 
			
 
				-#if 1
			
 
				+#if 0
			
 
				 #define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
			
 
				 #else
			
 
				 #define debug(fmt, ...)
			
 
				 #endif
			
 
				 
			
 
				+#ifdef STARPU_PAPI
			
 
				 static const int N_EVTS = 2;
			
 
				 
			
 
				 static int nsockets;
			
@@ -65,6 +70,8 @@ static int EventSet = PAPI_NULL;
 
				 /*This is where we store the values we read from the eventset */
			
 
				 static long long *values;
			
 
				 
			
 
				+#endif
			
 
				+
			
 
				 static double t1;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -74,18 +81,23 @@ static nvmlDevice_t device;
 
				 #endif
			
 
				 #endif
			
 
				 
			
 
				-int starpu_energy_start(enum starpu_worker_archtype archi)
			
 
				+int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
			
 
				 {
			
 
				-	int retval, number;
			
 
				-	int i;
			
 
				-
			
 
				-	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				-	hwloc_topology_t topology = config->topology.hwtopology;
			
 
				-
			
 
				-	nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
			
 
				+	t1 = starpu_timing_now();
			
 
				 
			
 
				 	switch (archi) {
			
 
				+#ifdef STARPU_PAPI
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				 	case STARPU_CPU_WORKER:
			
 
				+	{
			
 
				+		STARPU_ASSERT_MSG(workerid == -1, "For CPUs we cannot measure each worker separately, use where = STARPU_CPU and leave workerid as -1\n");
			
 
				+
			
 
				+		int retval, number;
			
 
				+
			
 
				+		struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+		hwloc_topology_t topology = config->topology.hwtopology;
			
 
				+
			
 
				+		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
			
 
				 
			
 
				 		values=calloc(nsockets * N_EVTS,sizeof(long long));
			
 
				 		STARPU_ASSERT(values);
			
@@ -97,6 +109,8 @@ int starpu_energy_start(enum starpu_worker_archtype archi)
 
				 		if ( (retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
			
 
				 			ERROR_RETURN(retval);
			
 
				 
			
 
				+		int i;
			
 
				+
			
 
				 		for (i = 0 ; i < nsockets ; i ++ )
			
 
				 		{
			
 
				 			/* return the index of socket */
			
@@ -115,39 +129,54 @@ int starpu_energy_start(enum starpu_worker_archtype archi)
 
				 		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
			
 
				 			ERROR_RETURN(retval);
			
 
				 
			
 
				-		t1 = starpu_timing_now();
			
 
				-		break;
			
 
				+		return retval;
			
 
				+	}
			
 
				+#endif
			
 
				+#endif
			
 
				 
			
 
				 
			
 
				 #ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
			
 
				 	case STARPU_CUDA_WORKER:
			
 
				 	{
			
 
				-		int ret = nvmlDeviceGetHandleByIndex_v2 (0,  &device);
			
 
				+		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
			
 
				+		int devid = starpu_worker_get_devid(workerid);
			
 
				+		int ret = nvmlDeviceGetHandleByIndex_v2 (devid,  &device);
			
 
				+		if (ret != NVML_SUCCESS) {
			
 
				+			_STARPU_DISP("Could not get CUDA device %d from nvml\n", devid);
			
 
				+			return -1;
			
 
				+		}
			
 
				 		ret = nvmlDeviceGetTotalEnergyConsumption ( device, &energy_begin );
			
 
				+		if (ret != NVML_SUCCESS) {
			
 
				+			_STARPU_DISP("Could not measure energy used by CUDA device %d\n", devid);
			
 
				+			return -1;
			
 
				+		}
			
 
				+		return 0;
			
 
				 	}
			
 
				 	break;
			
 
				 #endif
			
 
				 
			
 
				 	default:
			
 
				 		printf("Error: worker is not supported ! \n");
			
 
				-	break;
			
 
				+		return -1;
			
 
				 	}
			
 
				-
			
 
				-
			
 
				-	return retval;
			
 
				 }
			
 
				 
			
 
				-int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, enum starpu_worker_archtype archi)
			
 
				+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi)
			
 
				 {
			
 
				 	double energy = 0.;
			
 
				 
			
 
				 	int retval;
			
 
				-	unsigned workerid = 0;
			
 
				 	unsigned cpuid = 0;
			
 
				+
			
 
				 	double t2 = starpu_timing_now();
			
 
				-	double t = t2 - t1;
			
 
				+	double t STARPU_ATTRIBUTE_UNUSED = t2 - t1;
			
 
				+
			
 
				 	switch (archi) {
			
 
				+#ifdef STARPU_PAPI
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				 	case STARPU_CPU_WORKER:
			
 
				+	{
			
 
				+		STARPU_ASSERT_MSG(workerid == -1, "For CPUs we cannot measure each worker separately, use where = STARPU_CPU and leave workerid as -1\n");
			
 
				 
			
 
				 		/* Stop counting and store the values into the array */
			
 
				 		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
			
@@ -157,23 +186,18 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
				 
			
 
				 		for( s = 0 ; s < nsockets ; s ++){
			
 
				 			for(k = 0 ; k < N_EVTS; k++) {
			
 
				-				energy += values[s * N_EVTS + k];
			
 
				+				double delta = values[s * N_EVTS + k]*0.23/1.0e9;
			
 
				+				energy += delta;
			
 
				 
			
 
				 				debug("%-40s%12.6f J\t(for %f us, Average Power %.1fW)\n",
			
 
				 					event_names[k],
			
 
				-					(energy*0.23/1.0e9),
			
 
				-					t,
			
 
				-					((energy*0.23/1.0e9)/(t*1.0E-6))
			
 
				+					delta, t, delta/(t*1.0E-6)
			
 
				 				);
			
 
				 			}
			
 
				 		}
			
 
				 
			
 
				 		energy = energy * 0.23 / 1.0e9 / ntasks;
			
 
				 
			
 
				-		struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid, STARPU_NMAX_SCHED_CTXS);
			
 
				-
			
 
				-		starpu_perfmodel_update_history(model, task, arch, cpuid, nimpl, energy);
			
 
				-
			
 
				 		/*removes all events from a PAPI event set */
			
 
				 		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
			
 
				 			ERROR_RETURN(retval);
			
@@ -183,26 +207,46 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
				 			ERROR_RETURN(retval);
			
 
				 
			
 
				 		break;
			
 
				+	}
			
 
				+#endif
			
 
				+#endif
			
 
				 
			
 
				 #ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
			
 
				 	case STARPU_CUDA_WORKER:
			
 
				 	{
			
 
				+		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
			
 
				 		int ret = nvmlDeviceGetTotalEnergyConsumption ( device, &energy_end );
			
 
				-		debug("energy consumption on device %d is %lld mJ \n", 0, (energy_end - energy_begin));
			
 
				+		if (ret != NVML_SUCCESS)
			
 
				+			return -1;
			
 
				+		energy = (energy_end - energy_begin) / 1000.;
			
 
				+		debug("energy consumption on device %d is %f mJ (for %f us, Average power %0.1fW)\n", 0, energy * 1000., t, energy / (t*1.0E-6));
			
 
				 		break;
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				 	default:
			
 
				 		printf("Error: worker type %d is not supported! \n", archi);
			
 
				+		return -1;
			
 
				 		break;
			
 
				 
			
 
				 	}
			
 
				 
			
 
				+
			
 
				+	struct starpu_perfmodel_arch *arch;
			
 
				+	if (workerid == -1)
			
 
				+		/* Just take one of them */
			
 
				+		workerid = starpu_worker_get_by_type(archi, 0);
			
 
				+
			
 
				+	arch = starpu_worker_get_perf_archtype(workerid, STARPU_NMAX_SCHED_CTXS);
			
 
				+
			
 
				+	starpu_perfmodel_update_history(model, task, arch, cpuid, nimpl, energy);
			
 
				+
			
 
				 	return retval;
			
 
				 
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_PAPI
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				 static int add_event(int eventSet, int socket)
			
 
				 {
			
 
				 	int retval, i;
			
@@ -233,3 +277,5 @@ static int add_event(int eventSet, int socket)
 
				 
			
 
				 	return ( PAPI_OK );
			
 
				 }
			
 
				+#endif
			
 
				+#endif
			
--- a/src/profiling/profiling.c
+++ b/src/profiling/profiling.c
@@ -195,10 +195,11 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 
				 	profiling_info = task->profiling_info;
			
 
				 	if (profiling_info && papi_nevents)
			
 
				 	{
			
 
				+		int i;
			
 
				 		profiling_info->papi_event_set = PAPI_NULL;
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
			
 
				 		PAPI_create_eventset(&profiling_info->papi_event_set);
			
 
				-		for(int i=0; i<papi_nevents; i++)
			
 
				+		for(i=0; i<papi_nevents; i++)
			
 
				 		{
			
 
				 			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
			
 
				 			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
			
@@ -224,9 +225,10 @@ void _starpu_profiling_papi_task_stop_counters(struct starpu_task *task)
 
				 
			
 
				 	if (profiling_info && papi_nevents)
			
 
				 	{
			
 
				+		int i;
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
			
 
				 		PAPI_stop(profiling_info->papi_event_set, profiling_info->papi_values);
			
 
				-		for(int i=0; i<papi_nevents; i++)
			
 
				+		for(i=0; i<papi_nevents; i++)
			
 
				 		{
			
 
				 			_STARPU_TRACE_PAPI_TASK_EVENT(papi_events[i], task, profiling_info->papi_values[i]);
			
 
				 		}
			
--- a/tests/perfmodels/regression_based_memset.c
+++ b/tests/perfmodels/regression_based_memset.c
@@ -19,7 +19,7 @@
 
				 #include <starpu_scheduler.h>
			
 
				 #include "../helper.h"
			
 
				 
			
 
				-#define ERROR_RETURN(retval) { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  exit(retval); }
			
 
				+#define ERROR_RETURN(retval) { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); }
			
 
				 
			
 
				 /*
			
 
				  * Benchmark memset with a linear and non-linear regression
			
@@ -160,12 +160,16 @@ static void test_memset(int nelems, struct starpu_codelet *codelet)
 
				         starpu_data_unregister(handle);
			
 
				 }
			
 
				 
			
 
				-static int test_memset_energy(int nelems, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
			
 
				+static int test_memset_energy(int nelems, int workerid, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
			
 
				 {
			
 
				-	int nloops = starpu_worker_get_count_by_type(archtype) * NENERGY;
			
 
				+	int nloops;
			
 
				 	int loop;
			
 
				-	starpu_data_handle_t handle[nloops];
			
 
				 
			
 
				+	nloops = NENERGY;
			
 
				+	if (workerid == -1)
			
 
				+		nloops *= starpu_worker_get_count_by_type(archtype);
			
 
				+
			
 
				+	starpu_data_handle_t handle[nloops];
			
 
				 	for (loop = 0; loop < nloops; loop++)
			
 
				 	{
			
 
				 		struct starpu_task *task = starpu_task_create();
			
@@ -175,6 +179,11 @@ static int test_memset_energy(int nelems, int where, enum starpu_worker_archtype
 
				 		task->where = where;
			
 
				 		task->handles[0] = handle[loop];
			
 
				 		task->flops = nelems;
			
 
				+		if (workerid != -1)
			
 
				+		{
			
 
				+			task->execute_on_a_specific_worker = 1;
			
 
				+			task->workerid = workerid;
			
 
				+		}
			
 
				 
			
 
				 		int ret = starpu_task_submit(task);
			
 
				 		if (ret == -ENODEV)
			
@@ -190,7 +199,7 @@ static int test_memset_energy(int nelems, int where, enum starpu_worker_archtype
 
				 	return nloops;
			
 
				 }
			
 
				 
			
 
				-static void bench_energy(int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
			
 
				+static int bench_energy(int workerid, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
			
 
				 {
			
 
				 	int size;
			
 
				 	int retval;
			
@@ -201,6 +210,12 @@ static void bench_energy(int where, enum starpu_worker_archtype archtype, int im
 
				 		starpu_data_handle_t handle;
			
 
				 		starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
			
 
				 
			
 
				+		if ( (retval = starpu_energy_start(workerid, archtype)) != 0)
			
 
				+			ERROR_RETURN(retval);
			
 
				+
			
 
				+		/* Use a linear regression */
			
 
				+		ntasks = test_memset_energy(size, workerid, where, archtype, impl, codelet);
			
 
				+
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				 		task->cl = codelet;
			
 
				 		task->handles[0] = handle;
			
@@ -208,13 +223,7 @@ static void bench_energy(int where, enum starpu_worker_archtype archtype, int im
 
				 		task->destroy = 0;
			
 
				 		task->flops = size;
			
 
				 
			
 
				-		if ( (retval = starpu_energy_start(STARPU_CPU_WORKER)) != 0)
			
 
				-			ERROR_RETURN(retval);
			
 
				-
			
 
				-		/* Use a linear regression */
			
 
				-		ntasks = test_memset_energy(size, where, archtype, impl, codelet);
			
 
				-
			
 
				-		if ( (retval = starpu_energy_stop(codelet->energy_model, task, impl, ntasks, STARPU_CPU_WORKER)) != 0)
			
 
				+		if ( (retval = starpu_energy_stop(codelet->energy_model, task, impl, ntasks, workerid, archtype)) != 0)
			
 
				 			ERROR_RETURN(retval);
			
 
				 
			
 
				 		starpu_task_destroy (task);
			
@@ -248,6 +257,7 @@ int main(int argc, char **argv)
 
				 	struct starpu_conf conf;
			
 
				 	starpu_data_handle_t handle;
			
 
				 	int ret;
			
 
				+	unsigned i;
			
 
				 
			
 
				 	starpu_conf_init(&conf);
			
 
				 
			
@@ -324,17 +334,25 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
			
 
				 #endif
			
 
				 
			
 
				-	memset_cl.cpu_funcs[1] = NULL;
			
 
				-	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 0, &memset_cl);
			
 
				-	memset_cl.cpu_funcs[1] = memset_cpu;
			
 
				-	memset_cl.cpu_funcs[0] = NULL;
			
 
				-	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 1, &memset_cl);
			
 
				-
			
 
				-	nl_memset_cl.cpu_funcs[1] = NULL;
			
 
				-	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 0, &nl_memset_cl);
			
 
				-	nl_memset_cl.cpu_funcs[1] = memset_cpu;
			
 
				-	nl_memset_cl.cpu_funcs[0] = NULL;
			
 
				-	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 1, &nl_memset_cl);
			
 
				+	if (starpu_cpu_worker_get_count() > 0) {
			
 
				+		memset_cl.cpu_funcs[1] = NULL;
			
 
				+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 0, &memset_cl);
			
 
				+		memset_cl.cpu_funcs[1] = memset_cpu;
			
 
				+		memset_cl.cpu_funcs[0] = NULL;
			
 
				+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 1, &memset_cl);
			
 
				+
			
 
				+		nl_memset_cl.cpu_funcs[1] = NULL;
			
 
				+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 0, &nl_memset_cl);
			
 
				+		nl_memset_cl.cpu_funcs[1] = memset_cpu;
			
 
				+		nl_memset_cl.cpu_funcs[0] = NULL;
			
 
				+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 1, &nl_memset_cl);
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < starpu_cuda_worker_get_count(); i++) {
			
 
				+		int workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, i);
			
 
				+		bench_energy(workerid, STARPU_CUDA, STARPU_CUDA_WORKER, 0, &memset_cl);
			
 
				+		bench_energy(workerid, STARPU_CUDA, STARPU_CUDA_WORKER, 0, &nl_memset_cl);
			
 
				+	}
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         ret = starpu_opencl_unload_opencl(&opencl_program);