浏览代码

Inject nvml-based energy measurement in perfmodel

Samuel Thibault 4 年之前
父节点
当前提交
88e11d0908

+ 31 - 11
doc/doxygen/chapters/320_scheduling.doxy

@@ -248,14 +248,8 @@ We have extended the performance model of StarPU to measure energy and power val
 
 
 - To measure energy consumption of CPUs, we use the <c>RAPL</c> events, which are available on CPU architecture:
-
-const char* event_names[] = { "rapl::RAPL_ENERGY_PKG:cpu=%d",
-                              "rapl::RAPL_ENERGY_DRAM:cpu=%d"};
-
-
-Where <c>RAPL_ENERGY_PKG</c> represents the whole CPU socket power consumption.
-
-and <c>RAPL_ENERGY_DRAM</c> represents the RAM power consumption.
+<c>RAPL_ENERGY_PKG</c> that represents the whole CPU socket power consumption,
+and <c>RAPL_ENERGY_DRAM</c> that represents the RAM power consumption.
 
 
 
@@ -269,6 +263,10 @@ In order to use the right <c>rapl events</c> for energy measurement, user should
 $ papi_native_avail
 \endverbatim
 
+Depending on the system configuration, the user may have to run this as <b>root</b> to get the performance counter values.
+
+Since the measurement covers all the CPUs and the memory, the approach taken
+here is to run a series of tasks on all of them and to take the overall measurement.
 
 - The example below illustrates the energy and power measurements, using <c>starpu_energy_start()</c> and <c>starpu_energy_stop()</c> functions. 
  
@@ -282,13 +280,13 @@ In this example, we launch several tasks of the same type in parallel. To perfor
        
         unsigned N = starpu_cpu_worker_get_count() * 40;
 
-        starpu_energy_start();
+        starpu_energy_start(-1, STARPU_CPU_WORKER);
 
         for (i = 0 ; i < N ; i++)
-	  starpu_task_insert(&cl, STARPU_R, arg1, STARPU_RW, arg2, 0);
+	  starpu_task_insert(&cl, STARPU_EXECUTE_WHERE, STARPU_CPU, STARPU_R, arg1, STARPU_RW, arg2, 0);
 
         starpu_task_t *specimen = starpu_task_build(&cl, STARPU_R, arg1, STARPU_RW, arg2, 0);
-        starpu_energy_stop(&codelet.energy_model, specimen, N);
+        starpu_energy_stop(&codelet.energy_model, specimen, 0, N, -1, STARPU_CPU_WORKER);
 
        . . .
 
@@ -300,6 +298,28 @@ For the energy and power measurements, depending on the system configuration the
 
 <c>starpu_energy_stop()</c> function uses <c>PAPI_stop()</c> to stop counting and store the values into the array. We calculate both energy in <c>Joules</c> and power consumption in <c>Watt</c>. We call starpu_perfmodel_update_history() function in the performance model to provide explicit measurements.
 
+- In the CUDA case, nvml provides per-GPU energy measurement. We can thus calibrate the performance models per GPU:
+
+\code{.c}
+       
+        unsigned N = 40;
+        unsigned i, j;
+
+	/* Calibrate each CUDA worker separately, since nvml measures energy per GPU */
+	for (i = 0; i < starpu_cuda_worker_get_count(); i++) {
+		int workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, i);
+
+		starpu_energy_start(workerid, STARPU_CUDA_WORKER);
+
+		/* Use a separate index here: reusing i would clobber the outer per-worker loop */
+		for (j = 0 ; j < N ; j++)
+		  starpu_task_insert(&cl, STARPU_EXECUTE_ON_WORKER, workerid, STARPU_R, arg1, STARPU_RW, arg2, 0);
+
+		starpu_task_t *specimen = starpu_task_build(&cl, STARPU_R, arg1, STARPU_RW, arg2, 0);
+		starpu_energy_stop(&codelet.energy_model, specimen, 0, N, workerid, STARPU_CUDA_WORKER);
+
+       }
+       . . .
+
+\endcode
+
 \section StaticScheduling Static Scheduling
 
 In some cases, one may want to force some scheduling, for instance force a given

+ 12 - 4
include/starpu_perfmodel.h

@@ -320,17 +320,25 @@ int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
 /**
    starpu_energy_start - start counting hardware events in an event set
+
+   - \p workerid is the worker on which calibration is to be performed (a specific worker must be given for CUDA GPUs; use -1 for CPUs)
+   - \p archi is the type of architecture on which calibration will be run
 */
 
-int starpu_energy_start(enum starpu_worker_archtype archi);
+int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
 
 /**
    starpu_energy_stop - stop counting hardware events in an event set
-   \values -- an array to hold the counter values of the counting events
-   \EventSet -- an integer handle for a PAPI event set as created by papi_create_eventset()
+
+   - \p model is the energy performance model to be filled with the result
+   - \p task is a task specimen, so the performance model folds the result according to the parameter sizes of the task.
+   - \p nimpl is the implementation number run during calibration
+   - \p ntasks is the number of tasks run during calibration
+   - \p workerid is the worker on which calibration was performed (a specific worker must be given for CUDA GPUs; use -1 for CPUs)
+   - \p archi is the type of architecture on which calibration was run
 */
 
-int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, enum starpu_worker_archtype archi);
+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
 
 
 /**

+ 76 - 30
src/core/perfmodel/energy_model.c

@@ -15,8 +15,12 @@
  */
 
 #include <starpu.h>
+#ifdef STARPU_PAPI
 #include <papi.h>
-#include "hwloc.h"
+#endif
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
 #include <starpu_perfmodel.h>
 #include <starpu_profiling.h>
 #include <common/config.h>
@@ -39,14 +43,15 @@
 #endif
 #endif
 
-#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  exit(retval); } while (0)
+#define ERROR_RETURN(retval) do { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); } while (0)
 
-#if 1
+#if 0
 #define debug(fmt, ...) printf(fmt, ## __VA_ARGS__)
 #else
 #define debug(fmt, ...)
 #endif
 
+#ifdef STARPU_PAPI
 static const int N_EVTS = 2;
 
 static int nsockets;
@@ -65,6 +70,8 @@ static int EventSet = PAPI_NULL;
 /*This is where we store the values we read from the eventset */
 static long long *values;
 
+#endif
+
 static double t1;
 
 #ifdef STARPU_USE_CUDA
@@ -74,18 +81,23 @@ static nvmlDevice_t device;
 #endif
 #endif
 
-int starpu_energy_start(enum starpu_worker_archtype archi)
+int starpu_energy_start(int workerid, enum starpu_worker_archtype archi)
 {
-	int retval, number;
-	int i;
-
-	struct _starpu_machine_config *config = _starpu_get_machine_config();
-	hwloc_topology_t topology = config->topology.hwtopology;
-
-	nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
+	t1 = starpu_timing_now();
 
 	switch (archi) {
+#ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
 	case STARPU_CPU_WORKER:
+	{
+		STARPU_ASSERT_MSG(workerid == -1, "For CPUs we cannot measure each worker separately, use where = STARPU_CPU and leave workerid as -1\n");
+
+		int retval, number;
+
+		struct _starpu_machine_config *config = _starpu_get_machine_config();
+		hwloc_topology_t topology = config->topology.hwtopology;
+
+		nsockets = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
 
 		values=calloc(nsockets * N_EVTS,sizeof(long long));
 		STARPU_ASSERT(values);
@@ -97,6 +109,8 @@ int starpu_energy_start(enum starpu_worker_archtype archi)
 		if ( (retval = PAPI_create_eventset(&EventSet)) != PAPI_OK)
 			ERROR_RETURN(retval);
 
+		int i;
+
 		for (i = 0 ; i < nsockets ; i ++ )
 		{
 			/* return the index of socket */
@@ -115,39 +129,54 @@ int starpu_energy_start(enum starpu_worker_archtype archi)
 		if ( (retval = PAPI_start(EventSet)) != PAPI_OK)
 			ERROR_RETURN(retval);
 
-		t1 = starpu_timing_now();
-		break;
+		return retval;
+	}
+#endif
+#endif
 
 
 #ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 	case STARPU_CUDA_WORKER:
 	{
-		int ret = nvmlDeviceGetHandleByIndex_v2 (0,  &device);
+		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
+		int devid = starpu_worker_get_devid(workerid);
+		int ret = nvmlDeviceGetHandleByIndex_v2 (devid,  &device);
+		if (ret != NVML_SUCCESS) {
+			_STARPU_DISP("Could not get CUDA device %d from nvml\n", devid);
+			return -1;
+		}
 		ret = nvmlDeviceGetTotalEnergyConsumption ( device, &energy_begin );
+		if (ret != NVML_SUCCESS) {
+			_STARPU_DISP("Could not measure energy used by CUDA device %d\n", devid);
+			return -1;
+		}
+		return 0;
 	}
 	break;
 #endif
 
 	default:
 		printf("Error: worker is not supported ! \n");
-	break;
+		return -1;
 	}
-
-
-	return retval;
 }
 
-int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, enum starpu_worker_archtype archi)
+int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi)
 {
 	double energy = 0.;
 
 	int retval;
-	unsigned workerid = 0;
 	unsigned cpuid = 0;
+
 	double t2 = starpu_timing_now();
-	double t = t2 - t1;
+	double t STARPU_ATTRIBUTE_UNUSED = t2 - t1;
+
 	switch (archi) {
+#ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
 	case STARPU_CPU_WORKER:
+	{
+		STARPU_ASSERT_MSG(workerid == -1, "For CPUs we cannot measure each worker separately, use where = STARPU_CPU and leave workerid as -1\n");
 
 		/* Stop counting and store the values into the array */
 		if ( (retval = PAPI_stop(EventSet, values)) != PAPI_OK)
@@ -157,23 +186,18 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 
 		for( s = 0 ; s < nsockets ; s ++){
 			for(k = 0 ; k < N_EVTS; k++) {
-				energy += values[s * N_EVTS + k];
+				double delta = values[s * N_EVTS + k]*0.23/1.0e9;
+				energy += delta;
 
 				debug("%-40s%12.6f J\t(for %f us, Average Power %.1fW)\n",
 					event_names[k],
-					(energy*0.23/1.0e9),
-					t,
-					((energy*0.23/1.0e9)/(t*1.0E-6))
+					delta, t, delta/(t*1.0E-6)
 				);
 			}
 		}
 
 		energy = energy * 0.23 / 1.0e9 / ntasks;
 
-		struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid, STARPU_NMAX_SCHED_CTXS);
-
-		starpu_perfmodel_update_history(model, task, arch, cpuid, nimpl, energy);
-
 		/*removes all events from a PAPI event set */
 		if ( (retval = PAPI_cleanup_eventset(EventSet)) != PAPI_OK)
 			ERROR_RETURN(retval);
@@ -183,26 +207,46 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 			ERROR_RETURN(retval);
 
 		break;
+	}
+#endif
+#endif
 
 #ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 	case STARPU_CUDA_WORKER:
 	{
+		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
 		int ret = nvmlDeviceGetTotalEnergyConsumption ( device, &energy_end );
-		debug("energy consumption on device %d is %lld mJ \n", 0, (energy_end - energy_begin));
+		if (ret != NVML_SUCCESS)
+			return -1;
+		energy = (energy_end - energy_begin) / 1000.;
+		debug("energy consumption on device %d is %f mJ (for %f us, Average power %0.1fW)\n", 0, energy * 1000., t, energy / (t*1.0E-6));
 		break;
 	}
 #endif
 
 	default:
 		printf("Error: worker type %d is not supported! \n", archi);
+		return -1;
 		break;
 
 	}
 
+
+	struct starpu_perfmodel_arch *arch;
+	if (workerid == -1)
+		/* Just take one of them */
+		workerid = starpu_worker_get_by_type(archi, 0);
+
+	arch = starpu_worker_get_perf_archtype(workerid, STARPU_NMAX_SCHED_CTXS);
+
+	starpu_perfmodel_update_history(model, task, arch, cpuid, nimpl, energy);
+
 	return retval;
 
 }
 
+#ifdef STARPU_PAPI
+#ifdef STARPU_HAVE_HWLOC
 static int add_event(int eventSet, int socket)
 {
 	int retval, i;
@@ -233,3 +277,5 @@ static int add_event(int eventSet, int socket)
 
 	return ( PAPI_OK );
 }
+#endif
+#endif

+ 4 - 2
src/profiling/profiling.c

@@ -195,10 +195,11 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 	profiling_info = task->profiling_info;
 	if (profiling_info && papi_nevents)
 	{
+		int i;
 		profiling_info->papi_event_set = PAPI_NULL;
 		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
 		PAPI_create_eventset(&profiling_info->papi_event_set);
-		for(int i=0; i<papi_nevents; i++)
+		for(i=0; i<papi_nevents; i++)
 		{
 			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
 			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
@@ -224,9 +225,10 @@ void _starpu_profiling_papi_task_stop_counters(struct starpu_task *task)
 
 	if (profiling_info && papi_nevents)
 	{
+		int i;
 		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
 		PAPI_stop(profiling_info->papi_event_set, profiling_info->papi_values);
-		for(int i=0; i<papi_nevents; i++)
+		for(i=0; i<papi_nevents; i++)
 		{
 			_STARPU_TRACE_PAPI_TASK_EVENT(papi_events[i], task, profiling_info->papi_values[i]);
 		}

+ 41 - 23
tests/perfmodels/regression_based_memset.c

@@ -19,7 +19,7 @@
 #include <starpu_scheduler.h>
 #include "../helper.h"
 
-#define ERROR_RETURN(retval) { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  exit(retval); }
+#define ERROR_RETURN(retval) { fprintf(stderr, "Error %d %s:line %d: \n", retval,__FILE__,__LINE__);  return(retval); }
 
 /*
  * Benchmark memset with a linear and non-linear regression
@@ -160,12 +160,16 @@ static void test_memset(int nelems, struct starpu_codelet *codelet)
         starpu_data_unregister(handle);
 }
 
-static int test_memset_energy(int nelems, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
+static int test_memset_energy(int nelems, int workerid, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
 {
-	int nloops = starpu_worker_get_count_by_type(archtype) * NENERGY;
+	int nloops;
 	int loop;
-	starpu_data_handle_t handle[nloops];
 
+	nloops = NENERGY;
+	if (workerid == -1)
+		nloops *= starpu_worker_get_count_by_type(archtype);
+
+	starpu_data_handle_t handle[nloops];
 	for (loop = 0; loop < nloops; loop++)
 	{
 		struct starpu_task *task = starpu_task_create();
@@ -175,6 +179,11 @@ static int test_memset_energy(int nelems, int where, enum starpu_worker_archtype
 		task->where = where;
 		task->handles[0] = handle[loop];
 		task->flops = nelems;
+		if (workerid != -1)
+		{
+			task->execute_on_a_specific_worker = 1;
+			task->workerid = workerid;
+		}
 
 		int ret = starpu_task_submit(task);
 		if (ret == -ENODEV)
@@ -190,7 +199,7 @@ static int test_memset_energy(int nelems, int where, enum starpu_worker_archtype
 	return nloops;
 }
 
-static void bench_energy(int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
+static int bench_energy(int workerid, int where, enum starpu_worker_archtype archtype, int impl, struct starpu_codelet *codelet)
 {
 	int size;
 	int retval;
@@ -201,6 +210,12 @@ static void bench_energy(int where, enum starpu_worker_archtype archtype, int im
 		starpu_data_handle_t handle;
 		starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, size, sizeof(int));
 
+		if ( (retval = starpu_energy_start(workerid, archtype)) != 0)
+			ERROR_RETURN(retval);
+
+		/* Use a linear regression */
+		ntasks = test_memset_energy(size, workerid, where, archtype, impl, codelet);
+
 		struct starpu_task *task = starpu_task_create();
 		task->cl = codelet;
 		task->handles[0] = handle;
@@ -208,13 +223,7 @@ static void bench_energy(int where, enum starpu_worker_archtype archtype, int im
 		task->destroy = 0;
 		task->flops = size;
 
-		if ( (retval = starpu_energy_start(STARPU_CPU_WORKER)) != 0)
-			ERROR_RETURN(retval);
-
-		/* Use a linear regression */
-		ntasks = test_memset_energy(size, where, archtype, impl, codelet);
-
-		if ( (retval = starpu_energy_stop(codelet->energy_model, task, impl, ntasks, STARPU_CPU_WORKER)) != 0)
+		if ( (retval = starpu_energy_stop(codelet->energy_model, task, impl, ntasks, workerid, archtype)) != 0)
 			ERROR_RETURN(retval);
 
 		starpu_task_destroy (task);
@@ -248,6 +257,7 @@ int main(int argc, char **argv)
 	struct starpu_conf conf;
 	starpu_data_handle_t handle;
 	int ret;
+	unsigned i;
 
 	starpu_conf_init(&conf);
 
@@ -324,17 +334,25 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
 
-	memset_cl.cpu_funcs[1] = NULL;
-	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 0, &memset_cl);
-	memset_cl.cpu_funcs[1] = memset_cpu;
-	memset_cl.cpu_funcs[0] = NULL;
-	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 1, &memset_cl);
-
-	nl_memset_cl.cpu_funcs[1] = NULL;
-	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 0, &nl_memset_cl);
-	nl_memset_cl.cpu_funcs[1] = memset_cpu;
-	nl_memset_cl.cpu_funcs[0] = NULL;
-	bench_energy(STARPU_CPU, STARPU_CPU_WORKER, 1, &nl_memset_cl);
+	if (starpu_cpu_worker_get_count() > 0) {
+		memset_cl.cpu_funcs[1] = NULL;
+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 0, &memset_cl);
+		memset_cl.cpu_funcs[1] = memset_cpu;
+		memset_cl.cpu_funcs[0] = NULL;
+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 1, &memset_cl);
+
+		nl_memset_cl.cpu_funcs[1] = NULL;
+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 0, &nl_memset_cl);
+		nl_memset_cl.cpu_funcs[1] = memset_cpu;
+		nl_memset_cl.cpu_funcs[0] = NULL;
+		bench_energy(-1, STARPU_CPU, STARPU_CPU_WORKER, 1, &nl_memset_cl);
+	}
+
+	for (i = 0; i < starpu_cuda_worker_get_count(); i++) {
+		int workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, i);
+		bench_energy(workerid, STARPU_CUDA, STARPU_CUDA_WORKER, 0, &memset_cl);
+		bench_energy(workerid, STARPU_CUDA, STARPU_CUDA_WORKER, 0, &nl_memset_cl);
+	}
 
 #ifdef STARPU_USE_OPENCL
         ret = starpu_opencl_unload_opencl(&opencl_program);