Browse Source

preliminary registration framework for StarPU performance counters

Olivier Aumage 6 years ago
parent
commit
5293b45f81

+ 1 - 0
.gitignore

@@ -24,6 +24,7 @@
 .dirstamp
 .tramp_history
 *.pc
+*.vim
 stamp-h[0-9]*
 starpu.log
 /gcc-plugin/include/starpu-gcc/config.h

+ 2 - 1
Makefile.am

@@ -114,7 +114,8 @@ versinclude_HEADERS = 				\
 	include/starpu_simgrid_wrap.h		\
 	include/starpu_mod.f90			\
 	include/fstarpu_mod.f90			\
-	include/starpu_clusters.h
+	include/starpu_clusters.h		\
+	include/starpu_perf_monitoring.h
 
 nodist_versinclude_HEADERS = 			\
 	include/starpu_config.h

+ 2 - 0
examples/Makefile.am

@@ -255,6 +255,8 @@ STARPU_EXAMPLES +=				\
 	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
+	perf_monitoring/perf_counters_01	\
+	perf_monitoring/perf_counters_02	\
 	scheduler/heteroprio_test		\
 	sched_ctx/sched_ctx			\
 	sched_ctx/sched_ctx_empty		\

+ 131 - 0
examples/perf_monitoring/perf_counters_01.c

@@ -0,0 +1,131 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <assert.h>
+#include <string.h>
+
+/* Print the name of `scope`, then one line per counter registered in it:
+ * rank/total, counter name, counter id (hex), value type and help text. */
+static void print_scope(const enum starpu_perf_counter_scope scope)
+{
+	const int count = starpu_perf_counter_nb(scope);
+	int rank;
+
+	printf("scope %s\n", starpu_perf_counter_scope_id_to_name(scope));
+	for (rank = 1; rank <= count; rank++)
+	{
+		/* ranks are displayed 1-based, lookups are 0-based */
+		const int counter_id = starpu_perf_counter_nth_to_id(scope, rank - 1);
+		const char *counter_name = starpu_perf_counter_id_to_name(counter_id);
+		const char *counter_help = starpu_perf_counter_get_help_string(counter_id);
+		const int counter_type = starpu_perf_counter_get_type_id(counter_id);
+		const char *counter_type_name = starpu_perf_counter_type_id_to_name(counter_type);
+
+		printf("%d/%d - %s (0x%08x): [%s] / %s\n", rank, count, counter_name, counter_id, counter_type_name, counter_help);
+	}
+}
+
+/* Exercise the introspection part of the performance counter API:
+ * check the name <-> id conversions for scopes and value types, then
+ * print the registered counters in three different ways. */
+int main(int argc, char **argv)
+{
+	int ret;
+
+	ret = starpu_init(NULL);
+	/* NOTE(review): 77 is presumably the test-suite "skipped" exit status -- confirm */
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* scope name -> scope id */
+	{
+		int id;
+
+		id = starpu_perf_counter_scope_name_to_id("global");
+		STARPU_ASSERT(id == starpu_perf_counter_scope_global);
+		
+		id = starpu_perf_counter_scope_name_to_id("per_worker");
+		STARPU_ASSERT(id == starpu_perf_counter_scope_per_worker);
+		
+		id = starpu_perf_counter_scope_name_to_id("per_codelet");
+		STARPU_ASSERT(id == starpu_perf_counter_scope_per_codelet);
+
+		(void)id;
+	}
+
+	/* scope id -> scope name */
+	{
+		const char *name;
+		
+		name = starpu_perf_counter_scope_id_to_name(starpu_perf_counter_scope_global);
+		STARPU_ASSERT(strcmp(name, "global") == 0);
+		
+		name = starpu_perf_counter_scope_id_to_name(starpu_perf_counter_scope_per_worker);
+		STARPU_ASSERT(strcmp(name, "per_worker") == 0);
+		
+		name = starpu_perf_counter_scope_id_to_name(starpu_perf_counter_scope_per_codelet);
+		STARPU_ASSERT(strcmp(name, "per_codelet") == 0);
+
+		(void)name;
+	}
+
+	/* type name -> type id */
+	{
+		int id;
+
+		id = starpu_perf_counter_type_name_to_id("int32");
+		STARPU_ASSERT(id == starpu_perf_counter_type_int32);
+
+		id = starpu_perf_counter_type_name_to_id("int64");
+		STARPU_ASSERT(id == starpu_perf_counter_type_int64);
+
+		id = starpu_perf_counter_type_name_to_id("float");
+		STARPU_ASSERT(id == starpu_perf_counter_type_float);
+
+		id = starpu_perf_counter_type_name_to_id("double");
+		STARPU_ASSERT(id == starpu_perf_counter_type_double);
+
+		(void)id;
+	}
+
+	/* type id -> type name */
+	{
+		const char *name;
+		
+		name = starpu_perf_counter_type_id_to_name(starpu_perf_counter_type_int32);
+		STARPU_ASSERT(strcmp(name, "int32") == 0);
+		
+		name = starpu_perf_counter_type_id_to_name(starpu_perf_counter_type_int64);
+		STARPU_ASSERT(strcmp(name, "int64") == 0);
+		
+		name = starpu_perf_counter_type_id_to_name(starpu_perf_counter_type_float);
+		STARPU_ASSERT(strcmp(name, "float") == 0);
+		
+		name = starpu_perf_counter_type_id_to_name(starpu_perf_counter_type_double);
+		STARPU_ASSERT(strcmp(name, "double") == 0);
+
+		(void)name;
+	}
+
+	/* enumerate the counters one by one through the query functions */
+	printf("programmatically get counters per scope\n");
+	print_scope(starpu_perf_counter_scope_global);
+	print_scope(starpu_perf_counter_scope_per_worker);
+	print_scope(starpu_perf_counter_scope_per_codelet);
+	printf("\n");
+
+	/* same information, printed by the library itself, one scope at a time */
+	printf("list available counters per scope\n");
+	starpu_perf_counter_list_avail(starpu_perf_counter_scope_global);
+	starpu_perf_counter_list_avail(starpu_perf_counter_scope_per_worker);
+	starpu_perf_counter_list_avail(starpu_perf_counter_scope_per_codelet);
+	printf("\n");
+
+	/* single call listing every scope */
+	printf("list all available counters\n");
+	starpu_perf_counter_list_all_avail(starpu_perf_counter_scope_global);
+	printf("\n");
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 241 - 0
examples/perf_monitoring/perf_counters_02.c

@@ -0,0 +1,241 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <assert.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+/* global counters */
+static int id_g_total_submitted;
+static int id_g_peak_submitted;
+static int id_g_peak_ready;
+
+/* per worker counters */
+static int id_w_total_executed;
+static int id_w_cumul_execution_time;
+
+/* per_codelet counters */
+static int id_c_total_submitted;
+static int id_c_peak_submitted;
+static int id_c_peak_ready;
+static int id_c_total_executed;
+static int id_c_cumul_execution_time;
+
+/* Listener callback for the global scope: reads the three global task
+ * counters out of `sample` and prints them on one line.
+ * `listener` and `context` are not used. */
+void g_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context)
+{
+	int32_t total_submitted;
+	int32_t peak_submitted;
+	int32_t peak_ready;
+
+	(void) listener;
+	(void) context;
+
+	total_submitted = starpu_perf_counter_sample_get_int32_value(sample, id_g_total_submitted);
+	peak_submitted = starpu_perf_counter_sample_get_int32_value(sample, id_g_peak_submitted);
+	peak_ready = starpu_perf_counter_sample_get_int32_value(sample, id_g_peak_ready);
+
+	printf("global: g_total_submitted = %d, g_peak_submitted = %d, g_peak_ready = %d\n", total_submitted, peak_submitted, peak_ready);
+}
+
+/* Listener callback for the per-worker scope: prints the id returned by
+ * starpu_worker_get_id() together with the two per-worker counters read
+ * from `sample`. `listener` and `context` are not used. */
+void w_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context)
+{
+	int worker;
+	int32_t executed;
+	double exec_time;
+
+	(void) listener;
+	(void) context;
+
+	worker = starpu_worker_get_id();
+	executed = starpu_perf_counter_sample_get_int32_value(sample, id_w_total_executed);
+	exec_time = starpu_perf_counter_sample_get_double_value(sample, id_w_cumul_execution_time);
+
+	printf("worker[%d]: w_total_executed = %d, w_cumul_execution_time = %lf\n", worker, executed, exec_time);
+}
+
+/* Listener callback for the per-codelet scope: prints the five per-codelet
+ * counters read from `sample`, labelled with the codelet name when it has
+ * one and with the codelet address otherwise.
+ *
+ * `context` is interpreted as a pointer to the monitored struct starpu_codelet.
+ * NOTE(review): this relies on the runtime passing the codelet (and not the
+ * listener's user_arg) as `context` -- confirm against the dispatch code. */
+void c_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context)
+{
+	(void) listener;
+	struct starpu_codelet *cl = context;
+	int32_t c_total_submitted = starpu_perf_counter_sample_get_int32_value(sample, id_c_total_submitted);
+	int32_t c_peak_submitted = starpu_perf_counter_sample_get_int32_value(sample, id_c_peak_submitted);
+	int32_t c_peak_ready = starpu_perf_counter_sample_get_int32_value(sample, id_c_peak_ready);
+	int32_t c_total_executed = starpu_perf_counter_sample_get_int32_value(sample, id_c_total_executed);
+	double c_cumul_execution_time = starpu_perf_counter_sample_get_double_value(sample, id_c_cumul_execution_time);
+	/* Only hand cl->name to %s when it is a real string: passing NULL to %s
+	 * is undefined behaviour (the original test was inverted). */
+	if (cl->name != NULL)
+	{
+		printf("codelet[%s]: c_total_submitted = %d, c_peak_submitted = %d, c_peak_ready = %d, c_total_executed = %d, c_cumul_execution_time = %lf\n", cl->name, c_total_submitted, c_peak_submitted, c_peak_ready, c_total_executed, c_cumul_execution_time);
+	}
+	else
+	{
+		/* %p requires a void * argument */
+		printf("codelet[%p]: c_total_submitted = %d, c_peak_submitted = %d, c_peak_ready = %d, c_total_executed = %d, c_cumul_execution_time = %lf\n", (void *)cl, c_total_submitted, c_peak_submitted, c_peak_ready, c_total_executed, c_cumul_execution_time);
+	}
+}
+
+/* CPU implementation of the example codelet.
+ *
+ * buffers[0] is accessed through the StarPU vector interface macros; cl_args
+ * carries one packed int, the number of iterations. Iteration i adds i to
+ * element (i modulo vector length) of the vector. */
+void f(void *buffers[], void *cl_args)
+{
+	int *int_vector = (int*)STARPU_VECTOR_GET_PTR(buffers[0]);
+	const int NX = (int)STARPU_VECTOR_GET_NX(buffers[0]);
+	/* Must be a modifiable, initialised object: starpu_codelet_unpack_args()
+	 * writes the unpacked value through the pointer it is given. */
+	int niters = 0;
+	int i;
+
+	starpu_codelet_unpack_args(cl_args, &niters);
+
+	/* nothing to update, and i % 0 would be undefined */
+	if (NX <= 0)
+		return;
+
+	for (i=0; i<niters; i++)
+	{
+		int_vector[i % NX] += i;
+	}
+}
+
+/* Codelet monitored by this example: a single CPU implementation (f)
+ * working on one data buffer. The name is what c_listener_cb reads through
+ * cl->name. */
+struct starpu_codelet cl =
+{
+	.cpu_funcs      = {f},
+	.cpu_funcs_name = {"f"},
+	.nbuffers       = 1,
+	.name           = "perf_counter_f"
+};
+
+/* Short aliases for the three counter scopes used throughout main() */
+const enum starpu_perf_counter_scope g_scope = starpu_perf_counter_scope_global;
+const enum starpu_perf_counter_scope w_scope = starpu_perf_counter_scope_per_worker;
+const enum starpu_perf_counter_scope c_scope = starpu_perf_counter_scope_per_codelet;
+
+/* Workload size: NTASKS tasks are spread round-robin over NVECTORS vectors
+ * of VECTOR_LEN ints, each task running NITER iterations */
+#define NVECTORS 5
+#define NTASKS 1000
+#define NITER 1000
+#define VECTOR_LEN 2
+
+/* Example driver: selects a few counters in each scope, attaches one
+ * listener per scope, submits NTASKS tasks of codelet `cl`, then detaches
+ * and releases everything in reverse order of creation. */
+int main(int argc, char **argv)
+{
+	int ret;
+
+	(void) argc;
+	(void) argv;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* one counter set per scope */
+	struct starpu_perf_counter_set *g_set = starpu_perf_counter_set_alloc(g_scope);
+	STARPU_ASSERT(g_set != NULL);
+	struct starpu_perf_counter_set *w_set = starpu_perf_counter_set_alloc(w_scope);
+	STARPU_ASSERT(w_set != NULL);
+	struct starpu_perf_counter_set *c_set = starpu_perf_counter_set_alloc(c_scope);
+	STARPU_ASSERT(c_set != NULL);
+
+	/* resolve counter names to ids; the ids are kept in file-scope variables
+	 * because the listener callbacks need them to read the samples */
+	id_g_total_submitted = starpu_perf_counter_name_to_id(g_scope, "starpu.task.g_total_submitted");
+	STARPU_ASSERT(id_g_total_submitted != -1);
+	id_g_peak_submitted = starpu_perf_counter_name_to_id(g_scope, "starpu.task.g_peak_submitted");
+	STARPU_ASSERT(id_g_peak_submitted != -1);
+	id_g_peak_ready = starpu_perf_counter_name_to_id(g_scope, "starpu.task.g_peak_ready");
+	STARPU_ASSERT(id_g_peak_ready != -1);
+
+
+	id_w_total_executed = starpu_perf_counter_name_to_id(w_scope, "starpu.task.w_total_executed");
+	STARPU_ASSERT(id_w_total_executed != -1);
+	id_w_cumul_execution_time = starpu_perf_counter_name_to_id(w_scope, "starpu.task.w_cumul_execution_time");
+	STARPU_ASSERT(id_w_cumul_execution_time != -1);
+
+	id_c_total_submitted = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_total_submitted");
+	STARPU_ASSERT(id_c_total_submitted != -1);
+	id_c_peak_submitted = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_peak_submitted");
+	STARPU_ASSERT(id_c_peak_submitted != -1);
+	id_c_peak_ready = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_peak_ready");
+	STARPU_ASSERT(id_c_peak_ready != -1);
+	id_c_total_executed = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_total_executed");
+	STARPU_ASSERT(id_c_total_executed != -1);
+	id_c_cumul_execution_time = starpu_perf_counter_name_to_id(c_scope, "starpu.task.c_cumul_execution_time");
+	STARPU_ASSERT(id_c_cumul_execution_time != -1);
+
+	/* select the counters to monitor in each set */
+	starpu_perf_counter_set_enable_id(g_set, id_g_total_submitted);
+	starpu_perf_counter_set_enable_id(g_set, id_g_peak_submitted);
+	starpu_perf_counter_set_enable_id(g_set, id_g_peak_ready);
+
+	starpu_perf_counter_set_enable_id(w_set, id_w_total_executed);
+	starpu_perf_counter_set_enable_id(w_set, id_w_cumul_execution_time);
+
+	starpu_perf_counter_set_enable_id(c_set, id_c_total_submitted);
+	starpu_perf_counter_set_enable_id(c_set, id_c_peak_submitted);
+	starpu_perf_counter_set_enable_id(c_set, id_c_peak_ready);
+	starpu_perf_counter_set_enable_id(c_set, id_c_total_executed);
+	starpu_perf_counter_set_enable_id(c_set, id_c_cumul_execution_time);
+
+	/* NOTE(review): the user_arg values are arbitrary integers; c_listener_cb
+	 * dereferences its `context` as a codelet pointer, so `context` must not
+	 * be this user_arg -- confirm against the runtime's dispatch code. */
+	struct starpu_perf_counter_listener * g_listener = starpu_perf_counter_listener_init(g_set, g_listener_cb, (void *)(uintptr_t)42);
+	struct starpu_perf_counter_listener * w_listener = starpu_perf_counter_listener_init(w_set, w_listener_cb, (void *)(uintptr_t)17);
+	struct starpu_perf_counter_listener * c_listener = starpu_perf_counter_listener_init(c_set, c_listener_cb, (void *)(uintptr_t)76);
+
+	starpu_perf_counter_set_global_listener(g_listener);
+	starpu_perf_counter_set_all_per_worker_listeners(w_listener);
+
+	starpu_perf_counter_set_per_codelet_listener(&cl, c_listener);
+
+	int* vector[NVECTORS];
+	starpu_data_handle_t vector_h[NVECTORS];
+	int v;
+	for (v=0; v<NVECTORS; v++)
+	{
+		/* element size is sizeof(int), i.e. sizeof(*vector[v]);
+		 * sizeof(*vector) would be the size of an int pointer */
+		vector[v] = calloc(VECTOR_LEN, sizeof(*vector[v]));
+		STARPU_ASSERT(vector[v] != NULL);
+
+		{
+			int i;
+			for (i=0; i<VECTOR_LEN; i++)
+			{
+				vector[v][i] = i;
+			}
+		}
+
+		starpu_vector_data_register(&vector_h[v], STARPU_MAIN_RAM, (uintptr_t)vector[v], VECTOR_LEN, sizeof(*vector[v]));
+	}
+
+	{
+		int i;
+		for (i=0; i<NTASKS; i++)
+		{
+			v = i % NVECTORS;
+			const int niter = NITER;
+			ret = starpu_insert_task(&cl,
+					STARPU_RW, vector_h[v],
+					STARPU_VALUE, &niter, sizeof(int),
+					0);
+			/* do not silently drop a failed submission */
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
+		}
+	}
+
+	for (v=0; v<NVECTORS; v++)
+	{
+		starpu_data_unregister(vector_h[v]);
+		free(vector[v]);
+	}
+
+	/* tear down in reverse order: detach listeners, destroy them, clear
+	 * the selected counters, free the sets */
+	starpu_perf_counter_unset_per_codelet_listener(&cl);
+	starpu_perf_counter_unset_all_per_worker_listeners();
+	starpu_perf_counter_unset_global_listener();
+
+	starpu_perf_counter_listener_exit(c_listener);
+	starpu_perf_counter_listener_exit(w_listener);
+	starpu_perf_counter_listener_exit(g_listener);
+
+	starpu_perf_counter_set_disable_id(c_set, id_c_cumul_execution_time);
+	starpu_perf_counter_set_disable_id(c_set, id_c_total_executed);
+	starpu_perf_counter_set_disable_id(c_set, id_c_peak_ready);
+	starpu_perf_counter_set_disable_id(c_set, id_c_peak_submitted);
+	starpu_perf_counter_set_disable_id(c_set, id_c_total_submitted);
+
+	starpu_perf_counter_set_disable_id(w_set, id_w_cumul_execution_time);
+	starpu_perf_counter_set_disable_id(w_set, id_w_total_executed);
+
+	starpu_perf_counter_set_disable_id(g_set, id_g_peak_ready);
+	starpu_perf_counter_set_disable_id(g_set, id_g_peak_submitted);
+	starpu_perf_counter_set_disable_id(g_set, id_g_total_submitted);
+
+	starpu_perf_counter_set_free(c_set);
+	c_set = NULL;
+
+	starpu_perf_counter_set_free(w_set);
+	w_set = NULL;
+
+	starpu_perf_counter_set_free(g_set);
+	g_set = NULL;
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 1 - 0
include/starpu.h

@@ -84,6 +84,7 @@ typedef INT_PTR intptr_t;
 #include <starpu_simgrid_wrap.h>
 #include <starpu_bitmap.h>
 #include <starpu_clusters.h>
+#include <starpu_perf_monitoring.h>
 
 #ifdef __cplusplus
 extern "C"

+ 99 - 0
include/starpu_perf_monitoring.h

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_PERF_MONITORING_H__
+#define __STARPU_PERF_MONITORING_H__
+
+#include <starpu.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+   @defgroup API_Perf_Monitoring Perf_Monitoring
+   @{
+*/
+
+/** Scope a performance counter belongs to. The textual names used by the
+    name/id conversion functions are "global", "per_worker" and
+    "per_codelet"; the undefined value has no name. */
+enum starpu_perf_counter_scope
+{
+	starpu_perf_counter_scope_undefined     = 0,
+	starpu_perf_counter_scope_global        = 1,
+	starpu_perf_counter_scope_per_worker    = 2,
+	starpu_perf_counter_scope_per_codelet   = 3
+};
+
+/** Value type of a performance counter. The textual names used by the
+    name/id conversion functions are "int32", "int64", "float" and
+    "double"; the undefined value has no name. */
+enum starpu_perf_counter_type
+{
+	starpu_perf_counter_type_undefined = 0,
+	starpu_perf_counter_type_int32     = 1,
+	starpu_perf_counter_type_int64     = 2,
+	starpu_perf_counter_type_float     = 3,
+	starpu_perf_counter_type_double    = 4
+};
+
+struct starpu_perf_counter_listener;
+struct starpu_perf_counter_sample;
+struct starpu_perf_counter_set;
+
+/** Convert a scope name to its enum value; return -1 for an unknown name. */
+int starpu_perf_counter_scope_name_to_id(const char *name);
+/** Convert a scope to its name; return NULL for a value without a name. */
+const char *starpu_perf_counter_scope_id_to_name(enum starpu_perf_counter_scope scope);
+
+/** Convert a value type name to its enum value; return -1 for an unknown name. */
+int starpu_perf_counter_type_name_to_id(const char *name);
+/** Convert a value type to its name; return NULL for a value without a name. */
+const char *starpu_perf_counter_type_id_to_name(enum starpu_perf_counter_type type);
+
+/** Return the number of counters registered in \p scope. */
+int starpu_perf_counter_nb(enum starpu_perf_counter_scope scope);
+/** Return the id of the counter called \p name in \p scope, or -1 if there is none. */
+int starpu_perf_counter_name_to_id(enum starpu_perf_counter_scope scope, const char *name);
+/** Return the id of the \p nth (0-based) counter of \p scope. */
+int starpu_perf_counter_nth_to_id(enum starpu_perf_counter_scope scope, int nth);
+/** Return the name of counter \p id, or NULL if its index is out of range. */
+const char *starpu_perf_counter_id_to_name(int id);
+/** Return the value type (enum starpu_perf_counter_type) of counter \p id. */
+int starpu_perf_counter_get_type_id(int id);
+/** Return the help string of counter \p id. */
+const char *starpu_perf_counter_get_help_string(int id);
+
+/** Print the counters registered in \p scope on the standard output. */
+void starpu_perf_counter_list_avail(enum starpu_perf_counter_scope scope);
+/** Print the counters of every scope on the standard output. */
+void starpu_perf_counter_list_all_avail(enum starpu_perf_counter_scope scope);
+
+/** Allocate a counter set for \p scope; no counter is selected initially. */
+struct starpu_perf_counter_set *starpu_perf_counter_set_alloc(enum starpu_perf_counter_scope scope);
+/** Free a set obtained from starpu_perf_counter_set_alloc(). */
+void starpu_perf_counter_set_free(struct starpu_perf_counter_set *set);
+
+/** Select counter \p id in \p set. */
+void starpu_perf_counter_set_enable_id(struct starpu_perf_counter_set *set, int id);
+/** Deselect counter \p id in \p set. */
+void starpu_perf_counter_set_disable_id(struct starpu_perf_counter_set *set, int id);
+
+/** Allocate a listener bound to \p set, \p callback and \p user_arg. */
+struct starpu_perf_counter_listener *starpu_perf_counter_listener_init(struct starpu_perf_counter_set *set, void (*callback)(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context), void *user_arg);
+/** Free a listener obtained from starpu_perf_counter_listener_init(). */
+void starpu_perf_counter_listener_exit(struct starpu_perf_counter_listener *listener);
+
+/** Attach \p listener to the global scope, to one worker, to all workers,
+    or to one codelet. */
+void starpu_perf_counter_set_global_listener(struct starpu_perf_counter_listener *listener);
+void starpu_perf_counter_set_per_worker_listener(unsigned workerid, struct starpu_perf_counter_listener *listener);
+void starpu_perf_counter_set_all_per_worker_listeners(struct starpu_perf_counter_listener *listener);
+void starpu_perf_counter_set_per_codelet_listener(struct starpu_codelet *cl, struct starpu_perf_counter_listener *listener);
+
+/** Detach the listener previously attached with the matching set function. */
+void starpu_perf_counter_unset_global_listener(void);
+void starpu_perf_counter_unset_per_worker_listener(unsigned workerid);
+void starpu_perf_counter_unset_all_per_worker_listeners(void);
+void starpu_perf_counter_unset_per_codelet_listener(struct starpu_codelet *cl);
+
+/** Read the value of counter \p counter_id from \p sample; the function
+    used is expected to match the counter's value type. */
+int32_t starpu_perf_counter_sample_get_int32_value(struct starpu_perf_counter_sample *sample, const int counter_id);
+int64_t starpu_perf_counter_sample_get_int64_value(struct starpu_perf_counter_sample *sample, const int counter_id);
+float starpu_perf_counter_sample_get_float_value(struct starpu_perf_counter_sample *sample, const int counter_id);
+double starpu_perf_counter_sample_get_double_value(struct starpu_perf_counter_sample *sample, const int counter_id);
+
+/** @} */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_PERF_MONITORING_H__ */

+ 4 - 1
include/starpu_task.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2017                                Inria
+ * Copyright (C) 2011-2017,2019                           Inria
  * Copyright (C) 2009-2018                                Université de Bordeaux
  * Copyright (C) 2010-2015,2017,2018,2019                 CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
@@ -558,6 +558,9 @@ struct starpu_codelet
 	   Various flags for the codelet.
 	 */
 	int flags;
+
+	struct starpu_perf_counter_sample *perf_counter_sample;
+	struct starpu_perf_counter_sample_cl_values *perf_counter_values;
 };
 
 /**

+ 3 - 1
src/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2011-2017                                Inria
+# Copyright (C) 2011-2017,2019                           Inria
 # Copyright (C) 2012                                     Benjamin Lorendeau
 # Copyright (C) 2009-2019                                Université de Bordeaux
 # Copyright (C) 2010-2015,2017,2018,2019                 CNRS
@@ -129,6 +129,7 @@ noinst_HEADERS = 						\
 	common/rbtree_i.h					\
 	common/prio_list.h					\
 	common/graph.h						\
+	common/knobs.h						\
 	drivers/driver_common/driver_common.h			\
 	drivers/mp_common/mp_common.h				\
 	drivers/mp_common/source_common.h			\
@@ -176,6 +177,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	common/rbtree.c						\
 	common/graph.c						\
 	common/inlines.c					\
+	common/knobs.c						\
 	core/jobs.c						\
 	core/task.c						\
 	core/task_bundle.c					\

+ 519 - 0
src/common/knobs.c

@@ -0,0 +1,519 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Performance counters and configurable knobs */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <starpu.h>
+#include <common/config.h>
+#include <common/starpu_spinlock.h>
+#include <core/workers.h>
+#include <common/knobs.h>
+
+/* Registry of the counters of one scope */
+struct perf_counter_array
+{
+	/* number of entries in `array` */
+	int size;
+	/* counter descriptors, grown by one at each registration */
+	struct starpu_perf_counter *array;
+	/* number of entries in `updater_array` */
+	int updater_array_size;
+	/* sample updater callbacks; not used by the registration code in this part of the file */
+	void (**updater_array)(struct starpu_perf_counter_sample *sample, void *context);
+};
+
+/* One initially empty counter registry per scope */
+static struct perf_counter_array global_counters	= { .size = 0, .array = NULL, .updater_array_size = 0, .updater_array = NULL };
+static struct perf_counter_array per_worker_counters	= { .size = 0, .array = NULL, .updater_array_size = 0, .updater_array = NULL };
+static struct perf_counter_array per_codelet_counters	= { .size = 0, .array = NULL, .updater_array_size = 0, .updater_array = NULL };
+
+/* Sample of the global scope; its lock is set up by _starpu_perf_counter_init() */
+static struct starpu_perf_counter_sample global_sample	= { .scope = starpu_perf_counter_scope_global, .listener = NULL, .value_array = NULL };
+
+/* - */
+
+/* Put `sample` in its initial state for `scope`: no listener attached,
+ * no value storage, lock initialised. `scope` must be a defined scope. */
+void _starpu_perf_counter_sample_init(struct starpu_perf_counter_sample *sample, enum starpu_perf_counter_scope scope)
+{
+	STARPU_ASSERT_PERF_COUNTER_SCOPE_DEFINED(scope);
+
+	sample->value_array = NULL;
+	sample->listener    = NULL;
+	sample->scope       = scope;
+
+	_starpu_spin_init(&sample->lock);
+}
+
+void _starpu_perf_counter_sample_exit(struct starpu_perf_counter_sample *sample)
+{
+	STARPU_ASSERT(sample->listener == NULL);
+	sample->listener = NULL;
+	if (sample->value_array)
+	{
+		free(sample->value_array);
+	}
+	sample->value_array = NULL;
+	sample->scope = starpu_perf_counter_scope_undefined;
+	_starpu_spin_destroy(&sample->lock);
+}
+
+/* - */
+
+/* Initialise the performance counter framework: set up the global sample,
+ * then let the modules register their counters. Must be called while the
+ * machine is not running. */
+void _starpu_perf_counter_init(void)
+{
+	STARPU_ASSERT(!_starpu_machine_is_running());
+	_starpu_perf_counter_sample_init(&global_sample, starpu_perf_counter_scope_global);
+
+	/* call the counter registration routine of each module */
+	_starpu__task_c__register_counters();
+}
+
+/* Counterpart of _starpu_perf_counter_init(): drop the counters registered
+ * in every scope and release the global sample. Must be called while the
+ * machine is not running. */
+void _starpu_perf_counter_exit(void)
+{
+	STARPU_ASSERT(!_starpu_machine_is_running());
+
+	_starpu_perf_counter_unregister_all_scopes();
+	_starpu_perf_counter_sample_exit(&global_sample);
+}
+
+/* - */
+
+/* Map a scope name ("global", "per_worker", "per_codelet") to the matching
+ * enum starpu_perf_counter_scope value; return -1 for any other name. */
+int starpu_perf_counter_scope_name_to_id(const char * const name)
+{
+	static const struct
+	{
+		const char *label;
+		int scope;
+	} known_scopes[] =
+	{
+		{ "global",      starpu_perf_counter_scope_global },
+		{ "per_worker",  starpu_perf_counter_scope_per_worker },
+		{ "per_codelet", starpu_perf_counter_scope_per_codelet }
+	};
+	unsigned k;
+
+	for (k = 0; k < sizeof(known_scopes)/sizeof(known_scopes[0]); k++)
+	{
+		if (strcmp(name, known_scopes[k].label) == 0)
+			return known_scopes[k].scope;
+	}
+	return -1;
+}
+
+/* Return the textual name of `scope`, or NULL when `scope` is not one of
+ * the global, per-worker or per-codelet scopes. */
+const char *starpu_perf_counter_scope_id_to_name(const enum starpu_perf_counter_scope scope)
+{
+	if (scope == starpu_perf_counter_scope_global)
+		return "global";
+	if (scope == starpu_perf_counter_scope_per_worker)
+		return "per_worker";
+	if (scope == starpu_perf_counter_scope_per_codelet)
+		return "per_codelet";
+	return NULL;
+}
+
+/* - */
+
+/* Map a value type name ("int32", "int64", "float", "double") to the
+ * matching enum starpu_perf_counter_type value; return -1 for any other name. */
+int starpu_perf_counter_type_name_to_id(const char * const name)
+{
+	static const struct
+	{
+		const char *label;
+		int type;
+	} known_types[] =
+	{
+		{ "int32",  starpu_perf_counter_type_int32 },
+		{ "int64",  starpu_perf_counter_type_int64 },
+		{ "float",  starpu_perf_counter_type_float },
+		{ "double", starpu_perf_counter_type_double }
+	};
+	unsigned k;
+
+	for (k = 0; k < sizeof(known_types)/sizeof(known_types[0]); k++)
+	{
+		if (strcmp(name, known_types[k].label) == 0)
+			return known_types[k].type;
+	}
+	return -1;
+}
+
+/* Return the textual name of `type`, or NULL when `type` is not one of
+ * the int32, int64, float or double types. */
+const char *starpu_perf_counter_type_id_to_name(const enum starpu_perf_counter_type type)
+{
+	if (type == starpu_perf_counter_type_int32)
+		return "int32";
+	if (type == starpu_perf_counter_type_int64)
+		return "int64";
+	if (type == starpu_perf_counter_type_float)
+		return "float";
+	if (type == starpu_perf_counter_type_double)
+		return "double";
+	return NULL;
+}
+
+/* Return the counter registry of `scope`. `scope` must be a defined scope;
+ * any other value aborts. */
+static struct perf_counter_array *_get_counters(const enum starpu_perf_counter_scope scope)
+{
+	STARPU_ASSERT_PERF_COUNTER_SCOPE_DEFINED(scope);
+	switch (scope)
+	{
+		case starpu_perf_counter_scope_global:
+			return &global_counters;
+
+		case starpu_perf_counter_scope_per_worker:
+			return &per_worker_counters;
+
+		case starpu_perf_counter_scope_per_codelet:
+			return &per_codelet_counters;
+
+		default:
+			STARPU_ABORT();
+	}
+}
+
+/* - */
+
+/* Append a counter to the registry of `scope` and return its id.
+ * The id is built from the scope and the counter's index in the registry.
+ * `name` and `help` are stored as pointers, not copied, so they must stay
+ * valid for as long as the counter is registered.
+ * Must be called while the machine is not running. */
+int _starpu_perf_counter_register(enum starpu_perf_counter_scope scope, const char *name, enum starpu_perf_counter_type type, const char *help)
+{
+	STARPU_ASSERT(!_starpu_machine_is_running());
+
+	struct perf_counter_array * const counters = _get_counters(scope);
+	STARPU_ASSERT_PERF_COUNTER_TYPE_DEFINED(type);
+
+	/* the new counter takes the next free index; grow the array by one entry */
+	const int index = counters->size++;
+	_STARPU_REALLOC(counters->array, counters->size * sizeof(*counters->array));
+
+	struct starpu_perf_counter * const new_counter = &counters->array[index];
+	const int id = _starpu_perf_counter_id_build(scope, index);
+	new_counter->id = id;
+	new_counter->name = name;
+	new_counter->help = help;
+	new_counter->type = type;
+
+	return id;
+}
+
+/* Drop every counter registered in `scope` and reset the registry to empty.
+ * Must be called while the machine is not running.
+ * NOTE(review): updater_array / updater_array_size are left untouched here;
+ * confirm they are released elsewhere if they are ever allocated. */
+static void _unregister_scope(enum starpu_perf_counter_scope scope)
+{
+	STARPU_ASSERT(!_starpu_machine_is_running());
+
+	struct perf_counter_array * const counters = _get_counters(scope);
+	free(counters->array);
+	counters->array = NULL;
+	counters->size  = 0;
+}
+
+/* Empty the counter registries of the global, per-worker and per-codelet
+ * scopes, in that order. Must be called while the machine is not running. */
+void _starpu_perf_counter_unregister_all_scopes(void)
+{
+	static const enum starpu_perf_counter_scope all_scopes[] =
+	{
+		starpu_perf_counter_scope_global,
+		starpu_perf_counter_scope_per_worker,
+		starpu_perf_counter_scope_per_codelet
+	};
+	unsigned k;
+
+	STARPU_ASSERT(!_starpu_machine_is_running());
+
+	for (k = 0; k < sizeof(all_scopes)/sizeof(all_scopes[0]); k++)
+	{
+		_unregister_scope(all_scopes[k]);
+	}
+}
+
+/* - */
+
+/* Return how many counters are currently registered in `scope`. */
+int starpu_perf_counter_nb(enum starpu_perf_counter_scope scope)
+{
+	return _get_counters(scope)->size;
+}
+
+/* Build the id of the `nth` (0-based) counter of `scope`.
+ * `nth` is not checked against the number of registered counters here. */
+int starpu_perf_counter_nth_to_id(enum starpu_perf_counter_scope scope, int nth)
+{
+	return _starpu_perf_counter_id_build(scope, nth);
+}
+
+/* Look up the counter called `name` in `scope` (first match in
+ * registration order) and return its id, or -1 if no counter matches. */
+int starpu_perf_counter_name_to_id(enum starpu_perf_counter_scope scope, const char *name)
+{
+	const struct perf_counter_array * const registry = _get_counters(scope);
+	int pos = 0;
+
+	while (pos < registry->size)
+	{
+		if (strcmp(name, registry->array[pos].name) == 0)
+			return _starpu_perf_counter_id_build(scope, pos);
+		pos++;
+	}
+	return -1;
+}
+
+/* Return the name of the counter designated by `id`, or NULL when the
+ * index part of `id` is outside the registry of its scope. */
+const char *starpu_perf_counter_id_to_name(int id)
+{
+	const int id_scope = _starpu_perf_counter_id_get_scope(id);
+	const int id_index = _starpu_perf_counter_id_get_index(id);
+	const struct perf_counter_array * const registry = _get_counters(id_scope);
+	const int in_range = (id_index >= 0) && (id_index < registry->size);
+
+	return in_range ? registry->array[id_index].name : NULL;
+}
+
+/* Return the help string of the counter designated by `id`.
+ * The index part of `id` must be within the registry of its scope. */
+const char *starpu_perf_counter_get_help_string(int id)
+{
+	const int id_scope = _starpu_perf_counter_id_get_scope(id);
+	const int id_index = _starpu_perf_counter_id_get_index(id);
+	const struct perf_counter_array * const registry = _get_counters(id_scope);
+	const struct starpu_perf_counter *entry;
+
+	STARPU_ASSERT(id_index >= 0 && id_index < registry->size);
+	entry = &registry->array[id_index];
+	return entry->help;
+}
+
+/* Return the value type of the counter designated by `id`.
+ * The index part of `id` must be within the registry of its scope. */
+int starpu_perf_counter_get_type_id(int id)
+{
+	const int id_scope = _starpu_perf_counter_id_get_scope(id);
+	const int id_index = _starpu_perf_counter_id_get_index(id);
+	const struct perf_counter_array * const registry = _get_counters(id_scope);
+	const struct starpu_perf_counter *entry;
+
+	STARPU_ASSERT(id_index >= 0 && id_index < registry->size);
+	entry = &registry->array[id_index];
+	return entry->type;
+}
+
+/* - */
+
+/* Print one line per counter registered in `scope` on the standard output:
+ * id (hex), name, value type name and help string. */
+void starpu_perf_counter_list_avail(enum starpu_perf_counter_scope scope)
+{
+	const struct perf_counter_array * const registry = _get_counters(scope);
+	int pos = 0;
+
+	while (pos < registry->size)
+	{
+		const struct starpu_perf_counter * const entry = registry->array + pos;
+		printf("0x%08x:%s [%s] - %s\n", _starpu_perf_counter_id_build(scope, pos), entry->name, starpu_perf_counter_type_id_to_name(entry->type), entry->help);
+		pos++;
+	}
+}
+
+/* Print the counters of every scope on the standard output, each list
+ * preceded by a "scope: <name>" header line.
+ * The `scope` argument is ignored: all three scopes are always listed. */
+void starpu_perf_counter_list_all_avail(enum starpu_perf_counter_scope scope)
+{
+	/* kept in the signature for API compatibility only */
+	(void) scope;
+
+	printf("scope: global\n");
+	starpu_perf_counter_list_avail(starpu_perf_counter_scope_global);
+
+	printf("scope: per_worker\n");
+	starpu_perf_counter_list_avail(starpu_perf_counter_scope_per_worker);
+
+	printf("scope: per_codelet\n");
+	starpu_perf_counter_list_avail(starpu_perf_counter_scope_per_codelet);
+}
+
+/* - */
+
+/* Allocate a counter set for `scope`.
+ * The set has one slot per counter registered in `scope` at the time of the
+ * call; all slots start cleared (presumably zeroed by _STARPU_CALLOC).
+ * Release it with starpu_perf_counter_set_free(). */
+struct starpu_perf_counter_set *starpu_perf_counter_set_alloc(enum starpu_perf_counter_scope scope)
+{
+	struct perf_counter_array *counters = _get_counters(scope);
+	struct starpu_perf_counter_set *set;
+	_STARPU_MALLOC(set, sizeof(*set));
+	set->scope = scope;
+	set->size  = counters->size;
+	_STARPU_CALLOC(set->index_array, set->size, sizeof(*set->index_array));
+	return set;
+}
+
+/* Free a counter set and its slot array. Both are zeroed before being
+ * freed. `set` must not be NULL. */
+void starpu_perf_counter_set_free(struct starpu_perf_counter_set *set)
+{
+	memset(set->index_array, 0, set->size*sizeof(*set->index_array));
+	free(set->index_array);
+	memset(set, 0, sizeof(*set));
+	free(set);
+}
+
+/* - */
+
+/* Select counter `id` in `set`.
+ * `id` must belong to the scope the set was allocated for and its index
+ * must be within the set. */
+void starpu_perf_counter_set_enable_id(struct starpu_perf_counter_set *set, int id)
+{
+	/* an id carries its scope: reject ids of another scope, whose index
+	 * would otherwise silently select an unrelated counter of this set */
+	STARPU_ASSERT(_starpu_perf_counter_id_get_scope(id) == (int)set->scope);
+	const int index = _starpu_perf_counter_id_get_index(id);
+	STARPU_ASSERT(index >= 0 && index < set->size);
+	set->index_array[index] = 1;
+}
+
+/* Deselect counter `id` in `set`.
+ * `id` must belong to the scope the set was allocated for and its index
+ * must be within the set. */
+void starpu_perf_counter_set_disable_id(struct starpu_perf_counter_set *set, int id)
+{
+	/* an id carries its scope: reject ids of another scope, whose index
+	 * would otherwise silently deselect an unrelated counter of this set */
+	STARPU_ASSERT(_starpu_perf_counter_id_get_scope(id) == (int)set->scope);
+	const int index = _starpu_perf_counter_id_get_index(id);
+	STARPU_ASSERT(index >= 0 && index < set->size);
+	set->index_array[index] = 0;
+}
+
+/* - */
+
+/* Allocate a listener and record the counter set, the callback and the
+ * user argument in it. The set is referenced, not copied.
+ * Release the listener with starpu_perf_counter_listener_exit(). */
+struct starpu_perf_counter_listener *starpu_perf_counter_listener_init(struct starpu_perf_counter_set *set,
+		void (*callback)(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context),
+		void *user_arg)
+{
+	struct starpu_perf_counter_listener *created;
+
+	_STARPU_MALLOC(created, sizeof(*created));
+	created->user_arg = user_arg;
+	created->callback = callback;
+	created->set      = set;
+
+	return created;
+}
+
+/* Zero and free a listener. The counter set it references is not freed.
+ * `listener` must not be NULL. */
+void starpu_perf_counter_listener_exit(struct starpu_perf_counter_listener *listener)
+{
+	memset(listener, 0, sizeof(*listener));
+	free(listener);
+}
+
+/* - */
+
+/* Attach `listener` to `sample` and allocate the sample's value storage,
+ * all under the sample lock.
+ * The sample must not already have a listener, and the listener's set must
+ * have the same scope as the sample. */
+static void set_listener(struct starpu_perf_counter_sample *sample, struct starpu_perf_counter_listener *listener)
+{
+	_starpu_spin_lock(&sample->lock);
+	STARPU_ASSERT(sample->listener == NULL);
+
+	STARPU_ASSERT(listener->set != NULL);
+	STARPU_ASSERT(listener->set->scope == sample->scope);
+
+	sample->listener = listener;
+
+	/* Assume a single listener, for now, which sets the set of counters to monitor */
+	STARPU_ASSERT(sample->value_array == NULL);
+	/* one value slot per entry of the listener's counter set */
+	_STARPU_CALLOC(sample->value_array, sample->listener->set->size, sizeof(*sample->value_array));
+	_starpu_spin_unlock(&sample->lock);
+}
+
+
+/* Attach 'listener' to the global-scope sample. */
+void starpu_perf_counter_set_global_listener(struct starpu_perf_counter_listener *listener)
+{
+	struct starpu_perf_counter_sample * const sample = &global_sample;
+	set_listener(sample, listener);
+}
+
+/* Attach 'listener' to the sample embedded in the worker structure of 'workerid'. */
+void starpu_perf_counter_set_per_worker_listener(unsigned workerid, struct starpu_perf_counter_listener *listener)
+{
+	set_listener(&_starpu_get_worker_struct(workerid)->perf_counter_sample, listener);
+}
+
+/* Attach the same 'listener' to the sample of every worker, in increasing worker id order. */
+void starpu_perf_counter_set_all_per_worker_listeners(struct starpu_perf_counter_listener *listener)
+{
+	const unsigned worker_count = _starpu_worker_get_count();
+	unsigned id = 0;
+	while (id < worker_count)
+	{
+		starpu_perf_counter_set_per_worker_listener(id, listener);
+		id++;
+	}
+}
+
+/* Enable per-codelet monitoring of 'cl' and attach 'listener' to its sample.
+ * The codelet must not already have counter values or a sample.
+ *
+ * The submission and execution paths test cl->perf_counter_values to decide
+ * whether to account for a task and refresh the codelet sample. That pointer
+ * is therefore published last, once the sample is allocated, initialized and
+ * has its listener, so those paths are not handed a NULL or half-built
+ * sample. This only narrows the window: no memory barrier is issued, so
+ * the listener should still be installed before tasks using 'cl' are submitted. */
+void starpu_perf_counter_set_per_codelet_listener(struct starpu_codelet *cl, struct starpu_perf_counter_listener *listener)
+{
+	STARPU_ASSERT(cl->perf_counter_values == NULL);
+	STARPU_ASSERT(cl->perf_counter_sample == NULL);
+
+	/* build the sample privately before making it reachable from the codelet */
+	struct starpu_perf_counter_sample *sample;
+	_STARPU_MALLOC(sample, sizeof(*sample));
+	_starpu_perf_counter_sample_init(sample, starpu_perf_counter_scope_per_codelet);
+	set_listener(sample, listener);
+	cl->perf_counter_sample = sample;
+
+	/* open the gate only now */
+	_STARPU_CALLOC(cl->perf_counter_values, 1, sizeof(*cl->perf_counter_values));
+}
+
+/* - */
+
+/* Detach the listener of 'sample' and release the sample's value array, all under the sample lock.
+ * The sample must currently have a listener.
+ * File-local, like set_listener(): only the public unset wrappers of this file call it. */
+static void unset_listener(struct starpu_perf_counter_sample *sample)
+{
+	_starpu_spin_lock(&sample->lock);
+	STARPU_ASSERT(sample->listener != NULL);
+
+	/* the array was sized from the listener's set, so wipe it before dropping the listener */
+	memset(sample->value_array, 0, sample->listener->set->size * sizeof(*sample->value_array));
+	free(sample->value_array);
+	sample->value_array = NULL;
+	sample->listener = NULL;
+	_starpu_spin_unlock(&sample->lock);
+}
+
+/* Detach the listener from the global-scope sample. */
+void starpu_perf_counter_unset_global_listener(void)
+{
+	unset_listener(&global_sample);
+}
+
+/* Detach the listener from the sample embedded in the worker structure of 'workerid'. */
+void starpu_perf_counter_unset_per_worker_listener(unsigned workerid)
+{
+	unset_listener(&_starpu_get_worker_struct(workerid)->perf_counter_sample);
+}
+
+/* Detach the listener from the sample of every worker, in increasing worker id order. */
+void starpu_perf_counter_unset_all_per_worker_listeners(void)
+{
+	const unsigned worker_count = _starpu_worker_get_count();
+	unsigned id = 0;
+	while (id < worker_count)
+	{
+		starpu_perf_counter_unset_per_worker_listener(id);
+		id++;
+	}
+}
+
+/* Disable per-codelet monitoring of 'cl': detach the listener, destroy and
+ * free the codelet sample, then free the codelet counter values.
+ * The codelet must currently have a sample. */
+void starpu_perf_counter_unset_per_codelet_listener(struct starpu_codelet *cl)
+{
+	struct starpu_perf_counter_sample * const sample = cl->perf_counter_sample;
+	STARPU_ASSERT(sample != NULL);
+
+	unset_listener(sample);
+	_starpu_perf_counter_sample_exit(sample);
+	free(sample);
+	cl->perf_counter_sample = NULL;
+
+	/* the values are released only after the listener is gone */
+	free(cl->perf_counter_values);
+	cl->perf_counter_values = NULL;
+}
+
+/* - */
+
+/* Append 'updater' to the updater array of 'scope'.
+ * Only allowed while the StarPU machine is not running. */
+void _starpu_perf_counter_register_updater(enum starpu_perf_counter_scope scope, void (*updater)(struct starpu_perf_counter_sample *sample, void *context))
+{
+	STARPU_ASSERT(!_starpu_machine_is_running());
+
+	struct perf_counter_array * const counters = _get_counters(scope);
+	const int slot = counters->updater_array_size;
+
+	/* grow the array by one entry and store the new updater in the last slot */
+	counters->updater_array_size = slot + 1;
+	_STARPU_REALLOC(counters->updater_array, counters->updater_array_size * sizeof(*counters->updater_array));
+	counters->updater_array[slot] = updater;
+}
+
+/* - */
+
+/* Refresh 'sample' and notify its listener, all under the sample lock.
+ * Nothing happens unless the sample has a listener with a counter set and
+ * the scope has at least one registered updater. Otherwise every updater of
+ * the scope is run in registration order with (sample, context), then the
+ * listener callback is invoked with the same sample and context. */
+static void update_sample(struct starpu_perf_counter_sample *sample, void *context)
+{
+	_starpu_spin_lock(&sample->lock);
+	struct perf_counter_array *counters = _get_counters(sample->scope);
+
+	/* for now, a sample is only updated when a listener with a set is plugged */
+	const int monitored = sample->listener != NULL && sample->listener->set != NULL;
+
+	if (monitored && counters->updater_array_size > 0)
+	{
+		int i;
+		for (i = 0; i < counters->updater_array_size; i++)
+		{
+			counters->updater_array[i](sample, context);
+		}
+
+		if (sample->listener != NULL)
+		{
+			sample->listener->callback(sample->listener, sample, context);
+		}
+	}
+	_starpu_spin_unlock(&sample->lock);
+}
+
+/* Refresh the global-scope sample; global updaters receive a NULL context. */
+void _starpu_perf_counter_update_global_sample(void)
+{
+	struct starpu_perf_counter_sample * const sample = &global_sample;
+	update_sample(sample, NULL);
+}
+
+/* Refresh the sample of worker 'workerid'; the worker structure is passed as updater context. */
+void _starpu_perf_counter_update_per_worker_sample(unsigned workerid)
+{
+	struct _starpu_worker * const w = _starpu_get_worker_struct(workerid);
+	struct starpu_perf_counter_sample * const sample = &w->perf_counter_sample;
+	update_sample(sample, w);
+}
+
+/* Refresh the sample of codelet 'cl'; the codelet is passed as updater context.
+ * cl->perf_counter_sample is handed over as is, so it must be non-NULL. */
+void _starpu_perf_counter_update_per_codelet_sample(struct starpu_codelet *cl)
+{
+	struct starpu_perf_counter_sample * const sample = cl->perf_counter_sample;
+	update_sample(sample, cl);
+}
+
+/* Generate the public getter starpu_perf_counter_sample_get_<STRING>_value(),
+ * which returns the <STRING>_val member of the value slot of 'counter_id' in 'sample'.
+ * The getter asserts that the counter has type <STRING>, that the sample has
+ * a listener with a set of the counter's scope, and that the counter is
+ * within the set and enabled in it. */
+#define STARPU_PERF_COUNTER_SAMPLE_GET_TYPED_VALUE(STRING, TYPE) \
+TYPE starpu_perf_counter_sample_get_##STRING##_value(struct starpu_perf_counter_sample *sample, const int counter_id) \
+{ \
+	STARPU_ASSERT(starpu_perf_counter_get_type_id(counter_id) == starpu_perf_counter_type_##STRING); \
+	STARPU_ASSERT(sample->listener != NULL && sample->listener->set != NULL); \
+	STARPU_ASSERT(_starpu_perf_counter_id_get_scope(counter_id) == sample->listener->set->scope); \
+ \
+	const struct starpu_perf_counter_set * const set = sample->listener->set; \
+	const int index =  _starpu_perf_counter_id_get_index(counter_id); \
+	STARPU_ASSERT(index < set->size); \
+	STARPU_ASSERT(set->index_array[index] > 0); \
+	return sample->value_array[index].STRING##_val; \
+}
+/* no ';' after the expansions: each one is a complete function definition,
+ * and a stray ';' at file scope is not valid ISO C */
+STARPU_PERF_COUNTER_SAMPLE_GET_TYPED_VALUE(int32, int32_t)
+STARPU_PERF_COUNTER_SAMPLE_GET_TYPED_VALUE(int64, int64_t)
+STARPU_PERF_COUNTER_SAMPLE_GET_TYPED_VALUE(float, float)
+STARPU_PERF_COUNTER_SAMPLE_GET_TYPED_VALUE(double, double)
+#undef STARPU_PERF_COUNTER_SAMPLE_GET_TYPED_VALUE
+

+ 255 - 0
src/common/knobs.h

@@ -0,0 +1,255 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Performance counters and configurable knobs */
+
+#ifndef __KNOBS_H__
+#define __KNOBS_H__
+
+#include <stdint.h>
+#include <starpu.h>
+#include <common/config.h>
+
+/* Assert that 't' is one of the known counter scopes.
+ * 't' is parenthesized so that any expression can be passed; it is evaluated
+ * up to three times, so it must have no side effects. */
+#define STARPU_ASSERT_PERF_COUNTER_SCOPE_DEFINED(t) STARPU_ASSERT( \
+		((t) == starpu_perf_counter_scope_global ) \
+		|| ((t) == starpu_perf_counter_scope_per_worker ) \
+		|| ((t) == starpu_perf_counter_scope_per_codelet ) \
+	)
+
+
+/* Assert that 't' is one of the known counter value types.
+ * 't' is parenthesized so that any expression can be passed; it is evaluated
+ * up to four times, so it must have no side effects. */
+#define STARPU_ASSERT_PERF_COUNTER_TYPE_DEFINED(t) STARPU_ASSERT( \
+		((t) == starpu_perf_counter_type_int32 ) \
+		|| ((t) == starpu_perf_counter_type_int64 ) \
+		|| ((t) == starpu_perf_counter_type_float ) \
+		|| ((t) == starpu_perf_counter_type_double ) \
+	)
+
+#define _STARPU_PERF_COUNTER_ID_SCOPE_BITS 4
+
+struct starpu_perf_counter_sample;
+struct _starpu_worker;
+
+#ifdef STARPU_HAVE_XCHG
+/* Generators for _starpu_perf_counter_update_<OPNAME>_<TYPENAME>(ptr, value):
+ * atomically replace *ptr with 'value' unless "*ptr OP value" already holds.
+ *
+ * Implemented as a compare-and-swap loop on the raw bit pattern, so that:
+ * - *ptr is only ever written with a value that improves it, which means a
+ *   concurrent reader never sees a max go down or a min go up;
+ * - the loop exits as soon as one swap succeeds, including when a floating
+ *   point operand is NaN (comparisons with NaN are false, so an
+ *   exchange-and-compare loop would never terminate);
+ * - the object is only accessed through may_alias types. */
+#define __STARPU_PERF_COUNTER_UPDATE_32BIT(OPNAME,OP,TYPENAME,TYPE) \
+static inline void _starpu_perf_counter_update_##OPNAME##_##TYPENAME(TYPE *ptr, TYPE value) \
+{ \
+	STARPU_ASSERT(sizeof(TYPE) == sizeof(uint32_t)); \
+	typedef uint32_t __attribute__((__may_alias__)) alias_uint32_t; \
+	typedef TYPE __attribute__((__may_alias__)) alias_##TYPE; \
+	const uint32_t raw_value = *(alias_uint32_t*)&value; \
+	/* atomic read of the current bit pattern */ \
+	uint32_t raw_old = STARPU_ATOMIC_ADD((alias_uint32_t*)ptr, 0); \
+	while (!(*(alias_##TYPE*)&raw_old OP value)) \
+	{ \
+		const uint32_t raw_seen = STARPU_VAL_COMPARE_AND_SWAP((alias_uint32_t*)ptr, raw_old, raw_value); \
+		if (raw_seen == raw_old) \
+			break; \
+		/* lost a race: compare again against what is stored now */ \
+		raw_old = raw_seen; \
+	} \
+}
+
+#define __STARPU_PERF_COUNTER_UPDATE_64BIT(OPNAME,OP,TYPENAME,TYPE) \
+static inline void _starpu_perf_counter_update_##OPNAME##_##TYPENAME(TYPE *ptr, TYPE value) \
+{ \
+	STARPU_ASSERT(sizeof(TYPE) == sizeof(uint64_t)); \
+	typedef uint64_t __attribute__((__may_alias__)) alias_uint64_t; \
+	typedef TYPE __attribute__((__may_alias__)) alias_##TYPE; \
+	const uint64_t raw_value = *(alias_uint64_t*)&value; \
+	/* atomic read of the current bit pattern */ \
+	uint64_t raw_old = STARPU_ATOMIC_ADDL((alias_uint64_t*)ptr, 0); \
+	while (!(*(alias_##TYPE*)&raw_old OP value)) \
+	{ \
+		const uint64_t raw_seen = STARPU_VAL_COMPARE_AND_SWAP((alias_uint64_t*)ptr, raw_old, raw_value); \
+		if (raw_seen == raw_old) \
+			break; \
+		/* lost a race: compare again against what is stored now */ \
+		raw_old = raw_seen; \
+	} \
+}
+
+/* Atomic max (no ';' after the expansions: each one is a complete function definition) */
+__STARPU_PERF_COUNTER_UPDATE_32BIT(max,>=,int32,int32_t)
+__STARPU_PERF_COUNTER_UPDATE_32BIT(max,>=,float,float)
+__STARPU_PERF_COUNTER_UPDATE_64BIT(max,>=,int64,int64_t)
+__STARPU_PERF_COUNTER_UPDATE_64BIT(max,>=,double,double)
+
+/* Atomic min */
+__STARPU_PERF_COUNTER_UPDATE_32BIT(min,<=,int32,int32_t)
+__STARPU_PERF_COUNTER_UPDATE_32BIT(min,<=,float,float)
+__STARPU_PERF_COUNTER_UPDATE_64BIT(min,<=,int64,int64_t)
+__STARPU_PERF_COUNTER_UPDATE_64BIT(min,<=,double,double)
+
+#undef __STARPU_PERF_COUNTER_UPDATE_32BIT
+#undef __STARPU_PERF_COUNTER_UPDATE_64BIT
+
+/* Floating point atomic accumulate */
+/* Atomically add 'acc_value' to the float at 'ptr'.
+ *
+ * The sum is computed from an atomically read bit pattern and installed with
+ * a compare-and-swap, retried from the freshly observed pattern whenever
+ * another thread changed *ptr in between. The new value is thus never
+ * written over a concurrent contribution, and the loop exits as soon as one
+ * swap succeeds. Patterns are compared as integers, so a NaN stored in *ptr
+ * cannot prevent termination. */
+static inline void _starpu_perf_counter_update_acc_float(float *ptr, float acc_value)
+{
+	STARPU_ASSERT(sizeof(float) == sizeof(uint32_t));
+	typedef uint32_t __attribute__((__may_alias__)) alias_uint32_t;
+	typedef float    __attribute__((__may_alias__)) alias_float;
+	/* atomic read of the current bit pattern */
+	alias_uint32_t raw_old = STARPU_ATOMIC_ADD((alias_uint32_t*)ptr, 0);
+	while(1)
+	{
+		float value = acc_value + *(alias_float*)&raw_old;
+		const alias_uint32_t raw_seen = STARPU_VAL_COMPARE_AND_SWAP((alias_uint32_t*)ptr, raw_old, *(alias_uint32_t*)&value);
+		if (raw_seen == raw_old)
+			break;
+		/* *ptr changed since it was read: redo the sum from the value actually stored */
+		raw_old = raw_seen;
+	}
+}
+/* Atomically add 'acc_value' to the double at 'ptr'.
+ *
+ * The sum is computed from an atomically read bit pattern and installed with
+ * a compare-and-swap, retried from the freshly observed pattern whenever
+ * another thread changed *ptr in between. The new value is thus never
+ * written over a concurrent contribution, and the loop exits as soon as one
+ * swap succeeds. Patterns are compared as integers, so a NaN stored in *ptr
+ * cannot prevent termination. */
+static inline void _starpu_perf_counter_update_acc_double(double *ptr, double acc_value)
+{
+	STARPU_ASSERT(sizeof(double) == sizeof(uint64_t));
+	typedef uint64_t __attribute__((__may_alias__)) alias_uint64_t;
+	typedef double   __attribute__((__may_alias__)) alias_double;
+	/* atomic read of the current bit pattern */
+	alias_uint64_t raw_old = STARPU_ATOMIC_ADDL((alias_uint64_t*)ptr, 0);
+	while(1)
+	{
+		double value = acc_value + *(alias_double*)&raw_old;
+		const alias_uint64_t raw_seen = STARPU_VAL_COMPARE_AND_SWAP((alias_uint64_t*)ptr, raw_old, *(alias_uint64_t*)&value);
+		if (raw_seen == raw_old)
+			break;
+		/* *ptr changed since it was read: redo the sum from the value actually stored */
+		raw_old = raw_seen;
+	}
+}
+#else
+#error TODO: implement fallback when locked exchange is not available
+#endif
+
+struct starpu_perf_counter /* description of one registered performance counter */
+{
+	int id; /* numerical id of the counter -- presumably built by _starpu_perf_counter_id_build(); TODO confirm */
+	const char *name; /* counter name; __STARPU_PERF_COUNTER_REG registers names of the form "<prefix>.<counter>" */
+	const char *help; /* help string given at registration */
+	enum starpu_perf_counter_type type; /* type of the counter value */
+};
+
+struct starpu_perf_counter_set /* selection of counters of a single scope */
+{
+	enum starpu_perf_counter_scope scope; /* scope the selected counters belong to */
+	int size; /* number of entries of index_array */
+	int *index_array; /* one flag per counter index: 1 once enabled, 0 when disabled */
+};
+
+union starpu_perf_counter_value /* storage for one counter value; the member used matches the counter type */
+{
+	int32_t int32_val; /* value of a starpu_perf_counter_type_int32 counter */
+	int64_t int64_val; /* value of a starpu_perf_counter_type_int64 counter */
+	float float_val; /* value of a starpu_perf_counter_type_float counter */
+	double double_val; /* value of a starpu_perf_counter_type_double counter */
+};
+
+struct starpu_perf_counter_listener /* consumer of sample updates */
+{
+	struct starpu_perf_counter_set *set; /* counters the listener monitors; also sizes the value array of the samples it is attached to */
+	void (*callback)(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context); /* invoked after the updaters have run on a sample, with the same context as the updaters */
+	void *user_arg; /* user pointer recorded by starpu_perf_counter_listener_init() */
+};
+
+struct starpu_perf_counter_sample /* values of the counters of one scope instance (global, one worker, or one codelet) */
+{
+	enum starpu_perf_counter_scope scope; /* scope of the sample; must match the scope of the attached listener's set */
+	struct starpu_perf_counter_listener *listener; /* attached listener, or NULL when none is attached */
+	union starpu_perf_counter_value *value_array; /* one slot per counter index of the listener's set; allocated when a listener is attached, NULL otherwise */
+	struct _starpu_spinlock lock; /* held while attaching or detaching a listener and while updating the sample */
+};
+
+struct starpu_perf_counter_sample_cl_values /* raw per-codelet counter values, updated atomically from the task paths */
+{
+	struct
+	{
+		int32_t total_submitted; /* incremented at each task submission */
+		int32_t peak_submitted; /* maximum reached by current_submitted */
+		int32_t current_submitted; /* incremented at submission, decremented when the task becomes ready */
+		int32_t peak_ready; /* maximum reached by current_ready */
+		int32_t current_ready; /* incremented when a task becomes ready, decremented when it starts running */
+		int32_t total_executed; /* incremented when job feedback is recorded after execution */
+		double cumul_execution_time; /* accumulated measured execution time, in microseconds */
+	} task;
+};
+
+typedef void (*starpu_perf_counter_sample_updater)(struct starpu_perf_counter_sample *sample, void *context);
+
+/* Extract the scope from a counter id: the scope is stored in the
+ * _STARPU_PERF_COUNTER_ID_SCOPE_BITS low-order bits. The id must be non-negative. */
+static inline int _starpu_perf_counter_id_get_scope(const int counter_id)
+{
+	const int scope_mask = (1 << _STARPU_PERF_COUNTER_ID_SCOPE_BITS) - 1;
+	STARPU_ASSERT(counter_id >= 0);
+	return counter_id & scope_mask;
+}
+
+/* Extract the counter index from a counter id: the index is stored above
+ * the _STARPU_PERF_COUNTER_ID_SCOPE_BITS low-order bits. The id must be non-negative. */
+static inline int _starpu_perf_counter_id_get_index(const int counter_id)
+{
+	STARPU_ASSERT(counter_id >= 0);
+	const int index = counter_id >> _STARPU_PERF_COUNTER_ID_SCOPE_BITS;
+	return index;
+}
+
+/* Build a counter id from a known scope and a non-negative index: the index
+ * is shifted above the scope bits and the scope is stored in the low-order bits. */
+static inline int _starpu_perf_counter_id_build(const enum starpu_perf_counter_scope scope, const int index)
+{
+	STARPU_ASSERT_PERF_COUNTER_SCOPE_DEFINED(scope);
+	STARPU_ASSERT(index >= 0);
+	const int shifted_index = index << _STARPU_PERF_COUNTER_ID_SCOPE_BITS;
+	return shifted_index | scope;
+}
+
+
+void _starpu_perf_counter_sample_init(struct starpu_perf_counter_sample *sample, enum starpu_perf_counter_scope scope);
+void _starpu_perf_counter_sample_exit(struct starpu_perf_counter_sample *sample);
+void _starpu_perf_counter_init(void);
+void _starpu_perf_counter_exit(void);
+
+int _starpu_perf_counter_register(enum starpu_perf_counter_scope scope, const char *name, enum starpu_perf_counter_type type, const char *help);
+void _starpu_perf_counter_unregister_all_scopes(void);
+
+void _starpu_perf_counter_register_updater(enum starpu_perf_counter_scope scope, void (*updater)(struct starpu_perf_counter_sample *sample, void *context)); /* append an updater for 'scope'; only while StarPU is not running */
+
+void _starpu_perf_counter_update_global_sample(void); /* run the global updaters (NULL context), then the listener callback */
+void _starpu_perf_counter_update_per_worker_sample(unsigned workerid); /* same for the sample of a worker; context is the worker structure */
+void _starpu_perf_counter_update_per_codelet_sample(struct starpu_codelet *cl); /* same for the sample of a codelet; context is the codelet */
+
+/* Generate the internal setter _starpu_perf_counter_sample_set_<STRING>_value(),
+ * which stores 'value' into the <STRING>_val member of the value slot of
+ * 'counter_id' in 'sample'. The store is skipped when the counter is not
+ * enabled in the listener's set. The setter asserts that the counter has
+ * type <STRING>, that the sample has a listener with a set of the counter's
+ * scope, and that the counter index is within the set. */
+#define __STARPU_PERF_COUNTER_SAMPLE_SET_TYPED_VALUE(STRING, TYPE) \
+static inline void _starpu_perf_counter_sample_set_##STRING##_value(struct starpu_perf_counter_sample *sample, const int counter_id, const TYPE value) \
+{ \
+	STARPU_ASSERT(starpu_perf_counter_get_type_id(counter_id) == starpu_perf_counter_type_##STRING); \
+	STARPU_ASSERT(sample->listener != NULL && sample->listener->set != NULL); \
+	STARPU_ASSERT(_starpu_perf_counter_id_get_scope(counter_id) == sample->listener->set->scope); \
+ \
+	const struct starpu_perf_counter_set * const set = sample->listener->set; \
+	const int index =  _starpu_perf_counter_id_get_index(counter_id); \
+	STARPU_ASSERT(index < set->size); \
+	if (set->index_array[index] > 0) \
+	{ \
+		sample->value_array[index].STRING##_val = value; \
+	} \
+}
+
+/* no ';' after the expansions: each one is a complete function definition,
+ * and a stray ';' at file scope is not valid ISO C */
+__STARPU_PERF_COUNTER_SAMPLE_SET_TYPED_VALUE(int32, int32_t)
+__STARPU_PERF_COUNTER_SAMPLE_SET_TYPED_VALUE(int64, int64_t)
+__STARPU_PERF_COUNTER_SAMPLE_SET_TYPED_VALUE(float, float)
+__STARPU_PERF_COUNTER_SAMPLE_SET_TYPED_VALUE(double, double)
+
+#undef __STARPU_PERF_COUNTER_SAMPLE_SET_TYPED_VALUE
+
+/* Register counter CTR under the name PREFIX "." "CTR" with the given scope,
+ * value type and help string, and store the returned id into the variable
+ * __CTR, which must be visible at the point of use. PREFIX must be a string
+ * literal, since it is concatenated at compile time. */
+#define __STARPU_PERF_COUNTER_REG(PREFIX, SCOPE, CTR, TYPESTRING, HELP) \
+	do \
+	{ \
+		__##CTR = _starpu_perf_counter_register((SCOPE), PREFIX "." #CTR, \
+				starpu_perf_counter_type_ ## TYPESTRING, (HELP)); \
+	} \
+	while (0)
+
+/* global counter variables */
+extern int32_t _starpu_task__g_total_submitted__value;
+extern int32_t _starpu_task__g_peak_submitted__value;
+extern int32_t _starpu_task__g_current_submitted__value;
+extern int32_t _starpu_task__g_peak_ready__value;
+extern int32_t _starpu_task__g_current_ready__value;
+
+/* performance counter registration routines per modules */
+void _starpu__task_c__register_counters(void);	/* module: task.c */
+
+
+#endif // __KNOBS_H__

+ 30 - 0
src/core/sched_policy.c

@@ -435,6 +435,27 @@ int _starpu_repush_task(struct _starpu_job *j)
 	unsigned can_push = _starpu_increment_nready_tasks_of_sched_ctx(task->sched_ctx, task->flops, task);
 	STARPU_ASSERT(task->status == STARPU_TASK_BLOCKED || task->status == STARPU_TASK_BLOCKED_ON_TAG || task->status == STARPU_TASK_BLOCKED_ON_TASK || task->status == STARPU_TASK_BLOCKED_ON_DATA);
 	task->status = STARPU_TASK_READY;
+	const unsigned continuation =
+#ifdef STARPU_OPENMP
+		j->continuation
+#else
+		0
+#endif
+		;
+	if (!j->internal && !continuation)
+	{
+		(void) STARPU_ATOMIC_ADD(& _starpu_task__g_current_submitted__value, -1);
+		int32_t value = STARPU_ATOMIC_ADD(& _starpu_task__g_current_ready__value, 1);
+		_starpu_perf_counter_update_max_int32(&_starpu_task__g_peak_ready__value, value);
+		if (task->cl && task->cl->perf_counter_values)
+		{
+			struct starpu_perf_counter_sample_cl_values * const pcv = task->cl->perf_counter_values;
+
+			(void)STARPU_ATOMIC_ADD(&pcv->task.current_submitted, -1);
+			int32_t value = STARPU_ATOMIC_ADD(&pcv->task.current_ready, 1);
+			_starpu_perf_counter_update_max_int32(&pcv->task.peak_ready, value);
+		}
+	}
 	STARPU_AYU_ADDTOTASKQUEUE(j->job_id, -1);
 	/* if the context does not have any workers save the tasks in a temp list */
 	if ((task->cl != NULL && task->where != STARPU_NOWHERE) && (!sched_ctx->is_initial_sched))
@@ -469,6 +490,15 @@ int _starpu_repush_task(struct _starpu_job *j)
 	 * corresponding dependencies */
 	if (task->cl == NULL || task->where == STARPU_NOWHERE)
 	{
+		if (!j->internal)
+		{
+			(void)STARPU_ATOMIC_ADD(& _starpu_task__g_current_ready__value, -1);
+			if (task->cl && task->cl->perf_counter_values)
+			{
+				struct starpu_perf_counter_sample_cl_values * const pcv = task->cl->perf_counter_values;
+				(void)STARPU_ATOMIC_ADD(&pcv->task.current_ready, -1);
+			}
+		}
 		task->status = STARPU_TASK_RUNNING;
 		if (task->prologue_callback_pop_func)
 		{

+ 113 - 2
src/core/task.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2018                                Inria
+ * Copyright (C) 2011-2019                                Inria
  * Copyright (C) 2009-2018                                Université de Bordeaux
  * Copyright (C) 2017                                     Erwan Leria
  * Copyright (C) 2010-2018                                CNRS
@@ -31,6 +31,7 @@
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/fxt.h>
+#include <common/knobs.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <math.h>
@@ -44,6 +45,94 @@
 #include <windows.h>
 #endif
 
+/* ids of the global-scope counters, assigned by _starpu__task_c__register_counters() */
+static int __g_total_submitted;
+static int __g_peak_submitted;
+static int __g_peak_ready;
+
+/* values backing the global counters (declared extern in common/knobs.h),
+ * updated with atomic operations from the task submission and scheduling paths */
+int32_t _starpu_task__g_total_submitted__value;
+int32_t _starpu_task__g_peak_submitted__value;
+int32_t _starpu_task__g_current_submitted__value;
+int32_t _starpu_task__g_peak_ready__value;
+int32_t _starpu_task__g_current_ready__value;
+
+/* ids of the per-worker counters, assigned by _starpu__task_c__register_counters();
+ * the values themselves are stored in each worker structure */
+static int __w_total_executed;
+static int __w_cumul_execution_time;
+
+/* ids of the per-codelet counters, assigned by _starpu__task_c__register_counters();
+ * the values themselves are stored in cl->perf_counter_values */
+static int __c_total_submitted;
+static int __c_peak_submitted;
+static int __c_peak_ready;
+static int __c_total_executed;
+static int __c_cumul_execution_time;
+
+/* Updater of the global scope: copy the current global task counter values
+ * into 'sample'. The global scope carries no context, so 'context' must be NULL. */
+static void global_sample_updater(struct starpu_perf_counter_sample *sample, void *context)
+{
+	(void)context;
+	STARPU_ASSERT(context == NULL); /* no context for the global updater */
+
+	const int32_t total_submitted = _starpu_task__g_total_submitted__value;
+	const int32_t peak_submitted = _starpu_task__g_peak_submitted__value;
+	const int32_t peak_ready = _starpu_task__g_peak_ready__value;
+
+	_starpu_perf_counter_sample_set_int32_value(sample, __g_total_submitted, total_submitted);
+	_starpu_perf_counter_sample_set_int32_value(sample, __g_peak_submitted, peak_submitted);
+	_starpu_perf_counter_sample_set_int32_value(sample, __g_peak_ready, peak_ready);
+}
+
+/* Updater of the per-worker scope: copy the counter values kept in the
+ * worker structure passed as 'context' into 'sample'. */
+static void per_worker_sample_updater(struct starpu_perf_counter_sample *sample, void *context)
+{
+	STARPU_ASSERT(context != NULL);
+	struct _starpu_worker *worker = context;
+
+	const int32_t total_executed = worker->__w_total_executed__value;
+	const double cumul_execution_time = worker->__w_cumul_execution_time__value;
+
+	_starpu_perf_counter_sample_set_int32_value(sample, __w_total_executed, total_executed);
+	_starpu_perf_counter_sample_set_double_value(sample, __w_cumul_execution_time, cumul_execution_time);
+}
+
+/* Updater of the per-codelet scope: copy the task counter values of the
+ * codelet passed as 'context' into 'sample'. The sample must have a
+ * listener whose set is of per-codelet scope. */
+static void per_codelet_sample_updater(struct starpu_perf_counter_sample *sample, void *context)
+{
+	STARPU_ASSERT(sample->listener != NULL && sample->listener->set != NULL);
+	STARPU_ASSERT(sample->listener->set->scope == starpu_perf_counter_scope_per_codelet);
+	STARPU_ASSERT(context != NULL);
+
+	struct starpu_codelet *cl = context;
+	struct starpu_perf_counter_sample_cl_values * const pcv = cl->perf_counter_values;
+
+	_starpu_perf_counter_sample_set_int32_value(sample, __c_total_submitted, pcv->task.total_submitted);
+	_starpu_perf_counter_sample_set_int32_value(sample, __c_peak_submitted, pcv->task.peak_submitted);
+	_starpu_perf_counter_sample_set_int32_value(sample, __c_peak_ready, pcv->task.peak_ready);
+	_starpu_perf_counter_sample_set_int32_value(sample, __c_total_executed, pcv->task.total_executed);
+	_starpu_perf_counter_sample_set_double_value(sample, __c_cumul_execution_time, pcv->task.cumul_execution_time);
+}
+
+/* Register the task-related counters of task.c under the "starpu.task"
+ * prefix, scope by scope, each scope being followed by the registration of
+ * its sample updater. The returned ids are stored in the file-scope __g_*,
+ * __w_* and __c_* variables. */
+void _starpu__task_c__register_counters(void)
+{
+	/* global scope */
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_global, g_total_submitted, int32, "number of tasks submitted globally (since StarPU initialization)");
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_global, g_peak_submitted, int32, "maximum simultaneous number of tasks submitted and not yet ready, globally (since StarPU initialization)");
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_global, g_peak_ready, int32, "maximum simultaneous number of tasks ready and not yet executing, globally (since StarPU initialization)");
+	_starpu_perf_counter_register_updater(starpu_perf_counter_scope_global, global_sample_updater);
+
+	/* per-worker scope */
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_per_worker, w_total_executed, int32, "number of tasks executed on this worker (since StarPU initialization)");
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_per_worker, w_cumul_execution_time, double, "cumulated execution time of tasks executed on this worker (microseconds, since StarPU initialization)");
+	_starpu_perf_counter_register_updater(starpu_perf_counter_scope_per_worker, per_worker_sample_updater);
+
+	/* per-codelet scope */
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_per_codelet, c_total_submitted, int32, "number of codelet's task instances submitted using this codelet (since enabled)");
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_per_codelet, c_peak_submitted, int32, "maximum simultaneous number of codelet's task instances submitted and not yet ready (since enabled)");
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_per_codelet, c_peak_ready, int32, "maximum simultaneous number of codelet's task instances ready and not yet executing (since enabled)");
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_per_codelet, c_total_executed, int32, "number of codelet's task instances executed using this codelet (since enabled)");
+	__STARPU_PERF_COUNTER_REG("starpu.task", starpu_perf_counter_scope_per_codelet, c_cumul_execution_time, double, "cumulated execution time of codelet's task instances (since enabled)");
+	_starpu_perf_counter_register_updater(starpu_perf_counter_scope_per_codelet, per_codelet_sample_updater);
+}
+
 /* XXX this should be reinitialized when StarPU is shutdown (or we should make
  * sure that no task remains !) */
 /* TODO we could make this hierarchical to avoid contention ? */
@@ -250,6 +339,7 @@ int starpu_task_wait(struct starpu_task *task)
 	if (task->destroy)
 		_starpu_task_destroy(task);
 
+	_starpu_perf_counter_update_global_sample();
 	_STARPU_TRACE_TASK_WAIT_END();
         _STARPU_LOG_OUT();
 	return 0;
@@ -661,6 +751,23 @@ int starpu_task_submit(struct starpu_task *task)
 		0
 #endif
 		;
+	if (!j->internal && !continuation)
+	{
+		(void) STARPU_ATOMIC_ADD(&_starpu_task__g_total_submitted__value, 1);
+		int32_t value = STARPU_ATOMIC_ADD(&_starpu_task__g_current_submitted__value, 1);
+		_starpu_perf_counter_update_max_int32(&_starpu_task__g_peak_submitted__value, value);
+		_starpu_perf_counter_update_global_sample();
+
+		if (task->cl && task->cl->perf_counter_values)
+		{
+			struct starpu_perf_counter_sample_cl_values * const pcv = task->cl->perf_counter_values;
+
+			(void) STARPU_ATOMIC_ADD(&pcv->task.total_submitted, 1);
+			int32_t value = STARPU_ATOMIC_ADD(&pcv->task.current_submitted, 1);
+			_starpu_perf_counter_update_max_int32(&pcv->task.peak_submitted, value);
+			_starpu_perf_counter_update_per_codelet_sample(task->cl);
+		}
+	}
 
 	if (!j->internal)
 	{
@@ -933,6 +1040,7 @@ int _starpu_task_wait_for_all_and_return_nb_waited_tasks(void)
 int starpu_task_wait_for_all(void)
 {
 	_starpu_task_wait_for_all_and_return_nb_waited_tasks();
+	_starpu_perf_counter_update_global_sample(); /* refresh the global counter sample (and notify its listener, if any) once the wait is over */
 	return 0;
 }
 
@@ -949,6 +1057,7 @@ int _starpu_task_wait_for_all_in_ctx_and_return_nb_waited_tasks(unsigned sched_c
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
 {
 	_starpu_task_wait_for_all_in_ctx_and_return_nb_waited_tasks(sched_ctx);
+	_starpu_perf_counter_update_global_sample(); /* refresh the global counter sample (and notify its listener, if any) once the wait is over */
 	return 0;
 }
 
@@ -984,13 +1093,13 @@ int starpu_task_wait_for_n_submitted(unsigned n)
 			}
 		}
 
-		return 0;
 	}
 	else
 	{
 		_STARPU_DEBUG("Waiting for tasks submitted to context %u\n", sched_ctx_id);
 		_starpu_wait_for_n_submitted_tasks_of_sched_ctx(sched_ctx_id, n);
 	}
+	_starpu_perf_counter_update_global_sample();
 	return 0;
 }
 
@@ -998,6 +1107,7 @@ int starpu_task_wait_for_n_submitted_in_ctx(unsigned sched_ctx, unsigned n)
 {
 	_starpu_wait_for_n_submitted_tasks_of_sched_ctx(sched_ctx, n);
 
+	_starpu_perf_counter_update_global_sample();
 	return 0;
 }
 /*
@@ -1033,6 +1143,7 @@ int starpu_task_wait_for_no_ready(void)
 		}
 	}
 
+	_starpu_perf_counter_update_global_sample();
 	return 0;
 }
 

+ 5 - 1
src/core/workers.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2018                                Inria
+ * Copyright (C) 2010-2019                                Inria
  * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -38,6 +38,7 @@
 #include <profiling/profiling.h>
 #include <sched_policies/sched_component.h>
 #include <datawizard/memory_nodes.h>
+#include <common/knobs.h>
 #include <top/starpu_top_core.h>
 #include <drivers/mp_common/sink_common.h>
 #include <drivers/scc/driver_scc_common.h>
@@ -619,6 +620,7 @@ void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu_machin
 	workerarg->state_unblock_in_parallel_req = 0;
 	workerarg->state_unblock_in_parallel_ack = 0;
 	workerarg->block_in_parallel_ref_count = 0;
+	_starpu_perf_counter_sample_init(&workerarg->perf_counter_sample, starpu_perf_counter_scope_per_worker);
 
 	/* cpu_set/hwloc_cpu_set/hwloc_obj initialized in topology.c */
 }
@@ -631,6 +633,7 @@ static void _starpu_worker_deinit(struct _starpu_worker *workerarg)
 	starpu_pthread_queue_unregister(&workerarg->wait, &_starpu_simgrid_task_queue[workerarg->workerid]);
 	starpu_pthread_wait_destroy(&workerarg->wait);
 #endif
+	_starpu_perf_counter_sample_exit(&workerarg->perf_counter_sample);
 }
 
 #ifdef STARPU_USE_FXT
@@ -1459,6 +1462,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	}
 
 	_starpu_initialize_registered_performance_models();
+	_starpu_perf_counter_init();
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	_starpu_cuda_init();

+ 8 - 1
src/core/workers.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2017                                Inria
+ * Copyright (C) 2011-2017,2019                           Inria
  * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -42,6 +42,7 @@
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
 #endif
+#include <common/knobs.h>
 
 #include <core/drivers.h>
 #include <drivers/cuda/driver_cuda.h>
@@ -199,6 +200,11 @@ LIST_TYPE(_starpu_worker,
 	hwloc_bitmap_t hwloc_cpu_set;
 	hwloc_obj_t hwloc_obj;
 #endif
+
+	struct starpu_perf_counter_sample perf_counter_sample;
+	int32_t __w_total_executed__value;
+	double __w_cumul_execution_time__value;
+
 );
 
 struct _starpu_combined_worker
@@ -361,6 +367,7 @@ struct _starpu_machine_topology
 	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
 
 	unsigned workers_mpi_ms_deviceid[STARPU_NMAXWORKERS];
+
 };
 
 struct _starpu_machine_config

+ 2 - 1
src/drivers/cpu/driver_cpu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011,2012,2014-2017                      Inria
+ * Copyright (C) 2011,2012,2014-2017,2019                 Inria
  * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017,2019                           CNRS
@@ -44,6 +44,7 @@
 #include <core/simgrid.h>
 #include <core/task.h>
 #include <core/disk.h>
+#include <common/knobs.h>
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>

+ 2 - 1
src/drivers/cuda/driver_cuda.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011,2012,2014,2016,2017                 Inria
+ * Copyright (C) 2011,2012,2014,2016,2017,2019            Inria
  * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017,2019                           CNRS
@@ -40,6 +40,7 @@
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <core/task.h>
+#include <common/knobs.h>
 
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>

+ 25 - 7
src/drivers/driver_common/driver_common.c

@@ -57,16 +57,24 @@ void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job
 	if (rank == 0)
 	{
 		STARPU_ASSERT(task->status == STARPU_TASK_READY);
+		if (!j->internal)
+		{
+			(void)STARPU_ATOMIC_ADD(& _starpu_task__g_current_ready__value, -1);
+			if (task->cl && task->cl->perf_counter_values)
+			{
+				struct starpu_perf_counter_sample_cl_values * const pcv = task->cl->perf_counter_values;
+				(void)STARPU_ATOMIC_ADD(&pcv->task.current_ready, -1);
+			}
+		}
 		task->status = STARPU_TASK_RUNNING;
 
 		STARPU_AYU_RUNTASK(j->job_id);
 		cl->per_worker_stats[workerid]++;
 
+		_starpu_clock_gettime(&worker->cl_start);
 		struct starpu_profiling_task_info *profiling_info = task->profiling_info;
-
 		if ((profiling && profiling_info) || calibrate_model || starpu_top)
 		{
-			_starpu_clock_gettime(&worker->cl_start);
 			_starpu_worker_register_executing_start_date(workerid, &worker->cl_start);
 		}
 		_starpu_job_notify_start(j, perf_arch);
@@ -143,10 +151,10 @@ void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j
 
 	if (rank == 0)
 	{
+		_starpu_clock_gettime(&worker->cl_end);
 		struct starpu_profiling_task_info *profiling_info = task->profiling_info;
 		if ((profiling && profiling_info) || calibrate_model || starpu_top)
 		{
-			_starpu_clock_gettime(&worker->cl_end);
 			_starpu_worker_register_executing_end(workerid);
 		}
 		STARPU_AYU_POSTRUNTASK(j->job_id);
@@ -197,12 +205,22 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 		calibrate_model = 1;
 #endif
 
-	if ((profiling && profiling_info) || calibrate_model)
+	starpu_timespec_sub(&worker->cl_end, &worker->cl_start, &measured_ts);
+	double measured = starpu_timing_timespec_to_us(&measured_ts);
+
+	worker->__w_total_executed__value++;
+	worker->__w_cumul_execution_time__value += measured;
+	_starpu_perf_counter_update_per_worker_sample(worker->workerid);
+	if (cl->perf_counter_values)
 	{
-		double measured;
+		struct starpu_perf_counter_sample_cl_values * const pcv = cl->perf_counter_values;
+		(void)STARPU_ATOMIC_ADD(&pcv->task.total_executed, 1);
+		_starpu_perf_counter_update_acc_double(&pcv->task.cumul_execution_time, measured);
+		_starpu_perf_counter_update_per_codelet_sample(cl);
+	}
 
-		starpu_timespec_sub(&worker->cl_end, &worker->cl_start, &measured_ts);
-		measured = starpu_timing_timespec_to_us(&measured_ts);
+	if ((profiling && profiling_info) || calibrate_model)
+	{
 		STARPU_ASSERT_MSG(measured >= 0, "measured=%lf\n", measured);
 
 		if (profiling && profiling_info)

+ 7 - 2
src/drivers/gordon/driver_gordon.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2008-2015,2017,2018                      Université de Bordeaux
- * Copyright (C) 2011,2012                                Inria
+ * Copyright (C) 2011,2012,2019                           Inria
  * Copyright (C) 2010,2011,2013,2015-2017,2019            CNRS
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -26,6 +26,7 @@
 #include "driver_gordon.h"
 #include "gordon_interface.h"
 #include <core/sched_policy.h>
+#include <common/knobs.h>
 
 static unsigned progress_thread_is_inited = 0;
 
@@ -210,9 +211,13 @@ static void gordon_callback_list_func(void *arg)
 #ifndef STARPU_SIMGRID
 		struct gordon_ppu_job_s * gordon_task = &task_wrapper->gordon_job[task_cnt];
 		struct starpu_perfmodel *model = j->task->cl->model;
+		double measured = (double)gordon_task->measured;
+		worker->__w_total_executed__value++;
+		worker->__w_cumul_execution_time__value += measured;
+		_starpu_perf_counter_update_per_worker_sample(worker->workerid);
+
 		if (model && model->benchmarking)
 		{
-			double measured = (double)gordon_task->measured;
 			unsigned cpuid = 0; /* XXX */
 
 			_starpu_update_perfmodel_history(j, j->task->cl->model, STARPU_GORDON_DEFAULT, cpuid, measured);

+ 2 - 1
src/drivers/mp_common/source_common.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012,2016,2017                           Inria
+ * Copyright (C) 2012,2016,2017,2019                      Inria
  * Copyright (C) 2013-2017,2019                           CNRS
  * Copyright (C) 2013-2015,2017                           Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -30,6 +30,7 @@
 #include <datawizard/memory_nodes.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <drivers/mp_common/mp_common.h>
+#include <common/knobs.h>
 
 #if defined(STARPU_USE_MPI_MASTER_SLAVE) && !defined(STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD)
 struct starpu_save_thread_env

+ 2 - 1
src/drivers/opencl/driver_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011,2012,2014-2017                      Inria
+ * Copyright (C) 2011,2012,2014-2017,2019                 Inria
  * Copyright (C) 2010-2019                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2017,2019                           CNRS
@@ -34,6 +34,7 @@
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <core/task.h>
+#include <common/knobs.h>
 
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>