|
@@ -0,0 +1,388 @@
|
|
|
|
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
|
+ *
|
|
|
|
+ * Copyright (C) 2011, 2012 INRIA
|
|
|
|
+ *
|
|
|
|
+ * StarPU is free software; you can redistribute it and/or modify
|
|
|
|
+ * it under the terms of the GNU Lesser General Public License as published by
|
|
|
|
+ * the Free Software Foundation; either version 2.1 of the License, or (at
|
|
|
|
+ * your option) any later version.
|
|
|
|
+ *
|
|
|
|
+ * StarPU is distributed in the hope that it will be useful, but
|
|
|
|
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
+ *
|
|
|
|
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
|
|
|
|
+ */
|
|
|
|
+
|
|
|
|
+#include "policy_utils.h"
|
|
|
|
+#include <math.h>
|
|
|
|
+
|
|
|
|
+static struct bound_task_pool *task_pools, *last;
|
|
|
|
+
|
|
|
|
+static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
|
|
|
|
+
|
|
|
|
+/* Account one more submitted task: find (or create) the task pool that
+ * aggregates tasks with the same codelet, footprint and scheduling context,
+ * and bump its counter.  Called at submission time; protected by `mutex'. */
+static void lp3_handle_submitted_job(struct starpu_task *task, unsigned footprint)
+{
+	struct bound_task_pool *tp;
+
+	pthread_mutex_lock(&mutex);
+
+	/* Fast path: the previously used pool often matches again */
+	if (last && last->cl == task->cl && last->footprint == footprint && last->sched_ctx_id == task->sched_ctx)
+		tp = last;
+	else
+		for (tp = task_pools; tp; tp = tp->next)
+			if (tp->cl == task->cl && tp->footprint == footprint && tp->sched_ctx_id == task->sched_ctx)
+				break;
+
+	if (!tp)
+	{
+		/* No matching pool yet: create one and push it at the list head.
+		 * NOTE(review): `last' is never updated here — presumably maintained
+		 * elsewhere; confirm before relying on the fast path. */
+		tp = malloc(sizeof(*tp));
+		if (!tp)
+		{
+			/* Out of memory: skip accounting for this task rather than
+			 * dereferencing NULL (previous code did not check malloc) */
+			pthread_mutex_unlock(&mutex);
+			return;
+		}
+		tp->cl = task->cl;
+		tp->footprint = footprint;
+		tp->sched_ctx_id = task->sched_ctx;
+		tp->n = 0;
+		tp->next = task_pools;
+		task_pools = tp;
+	}
+
+	/* One more task of this kind */
+	tp->n++;
+	pthread_mutex_unlock(&mutex);
+}
|
|
|
|
+
|
|
|
|
+/* Fill times[w][t] with the expected duration of one task of pool t on
+ * worker w, according to the history-based performance model.
+ * NAN is stored when no calibrated model exists for that (kernel, arch).
+ * The task index t follows the order of the `task_pools' list. */
+static void _starpu_get_tasks_times(int nw, int nt, double times[nw][nt])
+{
+	struct bound_task_pool *tp;
+	int w, t;
+	for (w = 0; w < nw; w++)
+	{
+		/* The arch type only depends on the worker — hoist it out of
+		 * the inner task loop instead of re-querying per task */
+		enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
+		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		{
+			double length = starpu_history_based_job_expected_perf(tp->cl->model, arch, tp->footprint);
+
+			if (isnan(length))
+				times[w][t] = NAN;
+			else
+				times[w][t] = length / 1000.; /* scale down by 1000 — presumably us -> ms; TODO confirm units */
+		}
+	}
+}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+ * GNU Linear Programming Kit backend
|
|
|
|
+ */
|
|
|
|
+#ifdef HAVE_GLPK_H
|
|
|
|
+#include <glpk.h>
|
|
|
|
+/* Build and solve the linear program deciding, for a given deadline tmax,
+ * how many tasks of each kind each worker runs (tasks[w][t]) and which
+ * context each worker belongs to (w_in_s[s][w], relaxed 0..1 variables).
+ * The objective maximizes the sum of the x[s][w] variables; a feasible
+ * schedule within tmax yields an objective equal to nw.
+ * Returns the objective value, or 0.0 when some task has no performance
+ * model at all or the simplex solver fails. */
+static double _glp_resolve(int ns, int nw, int nt, double tasks[nw][nt], double tmax, double w_in_s[ns][nw])
+{
+	struct bound_task_pool * tp;
+	int t, w, s;
+	glp_prob *lp;
+
+	lp = glp_create_prob();
+	glp_set_prob_name(lp, "StarPU theoretical bound");
+	glp_set_obj_dir(lp, GLP_MAX);
+	glp_set_obj_name(lp, "total execution time");
+
+	{
+		double times[nw][nt];
+		/* Matrix entry count: nt*nw task-assignment coefficients in the
+		 * per-(ctx,worker) time rows, nw*(nt+ns) covering the completion
+		 * rows plus the -tmax coefficients, nw*ns for the "one context
+		 * per worker" rows; +1 because GLPK arrays are 1-based. */
+		int ne =
+			+ nt * nw
+			+ nw * (nt+ns)
+			+ nw * ns
+			+ 1; /* glp dumbness */
+		int n = 1;
+		int ia[ne], ja[ne];
+		double ar[ne];
+
+		_starpu_get_tasks_times(nw, nt, times);
+
+		/* Variables: number of tasks t assigned to worker w (columns
+		 * colnum(w,t)), then x[s][w] membership variables. All 1-based. */
+		glp_add_cols(lp, nw*nt+ns*nw);
+#define colnum(w, t) ((t)*nw+(w)+1)
+		for(s = 0; s < ns; s++)
+			for(w = 0; w < nw; w++)
+				glp_set_obj_coef(lp, nw*nt+s*nw+w+1, 1.);
+
+		for (w = 0; w < nw; w++)
+			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+			{
+				char name[32];
+				snprintf(name, sizeof(name), "w%dt%dn", w, t);
+				glp_set_col_name(lp, colnum(w, t), name);
+				glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0., 0.);
+			}
+		for(s = 0; s < ns; s++)
+			for(w = 0; w < nw; w++)
+			{
+				char name[32];
+				snprintf(name, sizeof(name), "w%ds%dn", w, s);
+				glp_set_col_name(lp, nw*nt+s*nw+w+1, name);
+				glp_set_col_bnds(lp, nw*nt+s*nw+w+1, GLP_LO, 0., 0.);
+			}
+
+		int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+
+		int curr_row_idx = 0;
+		/* Bail out early if some task kind has no model on any worker */
+		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		{
+			int someone = 0;
+			for (w = 0; w < nw; w++)
+				if (!isnan(times[w][t]))
+					someone = 1;
+			if (!someone)
+			{
+				/* This task does not have any performance model at all, abort */
+				glp_delete_prob(lp);
+				return 0.0;
+			}
+		}
+		/* Total worker execution time: for each (ctx, worker), the time
+		 * spent on that ctx's tasks must not exceed tmax * x[s][w] */
+		glp_add_rows(lp, nw*ns);
+		for(s = 0; s < ns; s++)
+		{
+			for (w = 0; w < nw; w++)
+			{
+				char name[32], title[64];
+				starpu_worker_get_name(w, name, sizeof(name));
+				snprintf(title, sizeof(title), "worker %s", name);
+				glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
+				for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+				{
+					if(tp->sched_ctx_id == sched_ctxs[s])
+					{
+						ia[n] = curr_row_idx+s*nw+w+1;
+						ja[n] = colnum(w, t);
+						/* Huge penalty when no model: effectively forbids it */
+						if (isnan(times[w][t]))
+							ar[n] = 1000000000.;
+						else
+							ar[n] = times[w][t];
+						n++;
+					}
+				}
+				/* x[s][w] = 1 | 0 */
+				ia[n] = curr_row_idx+s*nw+w+1;
+				ja[n] = nw*nt+s*nw+w+1;
+				ar[n] = (-1) * tmax;
+				n++;
+				glp_set_row_bnds(lp, curr_row_idx+s*nw+w+1, GLP_UP, 0, 0);
+			}
+		}
+
+		curr_row_idx += nw*ns;
+
+		/* Total task completion: all tp->n tasks of each kind are run.
+		 * (Removed a stale starpu_worker_get_name(w, ...) call here: `w'
+		 * equals nw after the previous loop — out of range — and the
+		 * result was never used.) */
+		glp_add_rows(lp, nt);
+		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		{
+			char title[64];
+			snprintf(title, sizeof(title), "task %s key %x", tp->cl->name, (unsigned) tp->footprint);
+			glp_set_row_name(lp, curr_row_idx+t+1, title);
+			for (w = 0; w < nw; w++)
+			{
+				ia[n] = curr_row_idx+t+1;
+				ja[n] = colnum(w, t);
+				ar[n] = 1;
+				n++;
+			}
+			glp_set_row_bnds(lp, curr_row_idx+t+1, GLP_FX, tp->n, tp->n);
+		}
+
+		curr_row_idx += nt;
+
+		/* sum over s of x[s][w] == 1: each worker in exactly one context */
+		glp_add_rows(lp, nw);
+		for (w = 0; w < nw; w++)
+		{
+			char title[64];
+			snprintf(title, sizeof(title), "w%x", w);
+			glp_set_row_name(lp, curr_row_idx+w+1, title);
+			for(s = 0; s < ns; s++)
+			{
+				ia[n] = curr_row_idx+w+1;
+				ja[n] = nw*nt+s*nw+w+1;
+				ar[n] = 1;
+				n++;
+			}
+
+			glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1, 1);
+		}
+
+		STARPU_ASSERT(n == ne);
+
+		glp_load_matrix(lp, ne-1, ia, ja, ar);
+	}
+
+	glp_smcp parm;
+	glp_init_smcp(&parm);
+	parm.msg_lev = GLP_MSG_OFF;
+	int ret = glp_simplex(lp, &parm);
+	if (ret)
+	{
+		glp_delete_prob(lp);
+		lp = NULL;
+		return 0.0;
+	}
+
+	double res = glp_get_obj_val(lp);
+
+	printf("Z: %f (must be eq to nw %d)\n", res, nw);
+	for (w = 0; w < nw; w++)
+	{
+		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+		{
+			tasks[w][t] = glp_get_col_prim(lp, colnum(w, t));
+		}
+	}
+
+	for(s = 0; s < ns; s++)
+		for(w = 0; w < nw; w++)
+		{
+			/* GLPK columns are 1-based: the x[s][w] column is
+			 * nw*nt+s*nw+w+1, matching the setup above.  The previous
+			 * code omitted the +1 and read the wrong column. */
+			w_in_s[s][w] = glp_get_col_prim(lp, nw*nt+s*nw+w+1);
+			printf("worker %d ctx %d res %lf \n", w, s, w_in_s[s][w]);
+		}
+
+	glp_delete_prob(lp);
+	return res;
+}
|
|
|
|
+
|
|
|
|
+/* Apply the LP solution: for each context s, workers whose membership
+ * variable w_in_s[s][w] rounds to 1 are added to the context, the others
+ * removed.  Newly added workers inherit the context's new_workers_max_idle
+ * unless a specific max_idle was already configured.
+ * `nt' is kept in the signature for interface stability but is unused. */
+static void _redistribute_resources_in_ctxs(int ns, int nw, int nt, double w_in_s[ns][nw])
+{
+	int *sched_ctxs = sched_ctx_hypervisor_get_sched_ctxs();
+	int s, w;
+
+	for(s = 0; s < ns; s++)
+	{
+		int workers_to_add[nw], workers_to_remove[nw];
+		for(w = 0; w < nw; w++)
+		{
+			workers_to_add[w] = -1;
+			workers_to_remove[w] = -1;
+		}
+
+		int nadd = 0, nremove = 0;
+
+		/* Round the relaxed 0..1 membership to a binary decision */
+		for(w = 0; w < nw; w++)
+		{
+			if(w_in_s[s][w] >= 0.5)
+				workers_to_add[nadd++] = w;
+			else
+				workers_to_remove[nremove++] = w;
+		}
+
+		sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_to_remove, nremove, sched_ctxs[s]);
+
+		sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_to_add, nadd, sched_ctxs[s]);
+		struct policy_config *new_config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
+		int i;
+		/* Keep an explicitly configured max_idle; otherwise use the
+		 * context's default for newly added workers */
+		for(i = 0; i < nadd; i++)
+			new_config->max_idle[workers_to_add[i]] = new_config->max_idle[workers_to_add[i]] != MAX_IDLE_TIME ? new_config->max_idle[workers_to_add[i]] : new_config->new_workers_max_idle;
+	}
+
+}
|
|
|
|
+
|
|
|
|
+static int done = 0;
|
|
|
|
+/* Hypervisor hook called when a task is popped.  Once all flops of the
+ * context have been submitted and a velocity gap between contexts is
+ * detected, solve the LP once (`done' guard) for decreasing deadlines and
+ * redistribute workers according to the last feasible solution.
+ * `worker' is part of the hook interface but unused here. */
+static void lp3_handle_poped_task(unsigned sched_ctx, int worker)
+{
+	struct sched_ctx_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+
+	/* Only proceed when the hypervisor lock is free: never block the worker */
+	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
+	if(ret != EBUSY)
+	{
+		if(sc_w->submitted_flops < sc_w->total_flops)
+		{
+			pthread_mutex_unlock(&act_hypervisor_mutex);
+			return;
+		}
+
+		if(_velocity_gap_btw_ctxs() && !done)
+		{
+			done = 1;
+			int ns = sched_ctx_hypervisor_get_nsched_ctxs();
+			int nw = starpu_worker_get_count(); /* Number of different workers */
+			int nt = 0; /* Number of different kinds of tasks */
+			struct bound_task_pool * tp;
+			for (tp = task_pools; tp; tp = tp->next)
+				nt++;
+
+			double tasks[nw][nt];
+			double draft_tasks[nw][nt];
+			double w_in_s[ns][nw];
+			double draft_w_in_s[ns][nw];
+
+			int w,t, s;
+			for(w = 0; w < nw; w++)
+				for(t = 0; t < nt; t++)
+				{
+					tasks[w][t] = 0.0;
+					/* Fixed: was `==' (a no-op comparison) which left
+					 * draft_tasks uninitialized */
+					draft_tasks[w][t] = 0.0;
+				}
+
+			for(s = 0; s < ns; s++)
+				for(w = 0; w < nw; w++)
+				{
+					w_in_s[s][w] = 0.0;
+					draft_w_in_s[s][w] = 0.0;
+				}
+
+			/* Halve the deadline while the LP stays feasible (objective
+			 * == nw); keep the last feasible assignment */
+			double tmax = 30000;
+			double res = 1.0;
+			while(tmax >= 1.0)
+			{
+				printf("resolve for tmax = %lf\n", tmax);
+				res = _glp_resolve(ns, nw, nt, draft_tasks, tmax, draft_w_in_s);
+				if(res == (double)nw)
+				{
+					for(w = 0; w < nw; w++)
+						for(t = 0; t < nt; t++)
+							tasks[w][t] = draft_tasks[w][t];
+					for(s = 0; s < ns; s++)
+						for(w = 0; w < nw; w++)
+							w_in_s[s][w] = draft_w_in_s[s][w];
+
+				}
+				else
+				{
+
+					printf("break\n");
+					break;
+				}
+				tmax /= 2;
+			}
+
+			_redistribute_resources_in_ctxs(ns, nw, nt, w_in_s);
+		}
+		pthread_mutex_unlock(&act_hypervisor_mutex);
+	}
+}
|
|
|
|
+
|
|
|
|
+/* Policy descriptor: the lp3 policy reacts only to submitted jobs (to build
+ * the task pools) and to popped tasks (to trigger the LP-based
+ * redistribution); all other hypervisor hooks are unused. */
+struct hypervisor_policy lp3_policy = {
+	.handle_poped_task = lp3_handle_poped_task,
+	.handle_pushed_task = NULL,
+	.handle_idle_cycle = NULL,
+	.handle_idle_end = NULL,
+	.handle_post_exec_hook = NULL,
+	.handle_submitted_job = lp3_handle_submitted_job,
+	.custom = 0,
+	.name = "lp3"
+};
|
|
|
|
+
|
|
|
|
+#endif /* HAVE_GLPK_H */
|
|
|
|
+
|