
hierarchical contexts - bug fixing and DEBUG macro

Andra Hugo, 11 years ago
parent
Commit
5f388d2a95

+ 14 - 0
configure.ac

@@ -330,6 +330,20 @@ fi
 AM_CONDITIONAL([STARPU_BUILD_SC_HYPERVISOR], [test "x$build_sc_hypervisor" = "xyes"])
 AM_CONDITIONAL([STARPU_USE_SC_HYPERVISOR], [test "x$build_sc_hypervisor" = "xyes"])
 
+AC_ARG_ENABLE([sc_hypervisor_debug],
+  [AS_HELP_STRING([--enable-sc-hypervisor-debug],
+    [enable debug for resizing contexts (experimental)])],
+  [enable_sc_hypervisor_debug="$enableval"],
+  [enable_sc_hypervisor_debug="no"])
+
+
+AC_SUBST(STARPU_SC_HYPERVISOR_DEBUG, $enable_sc_hypervisor_debug)
+AM_CONDITIONAL([STARPU_SC_HYPERVISOR_DEBUG], [test "x$enable_sc_hypervisor_debug" = "xyes"])
+
+if test "x$enable_sc_hypervisor_debug" = "xyes"; then
+  AC_DEFINE(STARPU_SC_HYPERVISOR_DEBUG, [1], [enable debug sc_hypervisor])
+fi
+
 ###############################################################################
 #                                                                             #
 #                                 CPUs settings                               #
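
The new --enable-sc-hypervisor-debug switch defines the STARPU_SC_HYPERVISOR_DEBUG macro, which the rest of this commit uses to guard its diagnostic printf calls. A minimal sketch of the intended pattern, assuming a build configured with ./configure --enable-sc-hypervisor-debug (the helper below is illustrative, not part of the patch):

#include <stdio.h>
#include <starpu_config.h>	/* provides STARPU_SC_HYPERVISOR_DEBUG when enabled */

static void report_resize(unsigned sched_ctx, int nworkers)
{
#ifdef STARPU_SC_HYPERVISOR_DEBUG
	/* emitted only in debug builds of the hypervisor */
	printf("ctx %u resized to %d workers\n", sched_ctx, nworkers);
#else
	(void)sched_ctx;	/* silence unused-parameter warnings otherwise */
	(void)nworkers;
#endif
}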

+ 1 - 0
include/starpu_config.h.in

@@ -80,6 +80,7 @@
 #undef STARPU_MAXIMPLEMENTATIONS
 #undef STARPU_MAXMPKERNELS
 #undef STARPU_USE_SC_HYPERVISOR
+#undef STARPU_SC_HYPERVISOR_DEBUG
 #undef STARPU_HAVE_GLPK_H
 
 #undef STARPU_HAVE_LIBNUMA

+ 3 - 0
include/starpu_sched_ctx.h

@@ -113,6 +113,9 @@ double starpu_sched_ctx_get_nready_flops(unsigned sched_ctx_id);
 
 void starpu_sched_ctx_set_priority(int *workers, int nworkers, unsigned sched_ctx_id, unsigned priority);
 
+void starpu_sched_ctx_set_priority_on_level(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx, unsigned priority);
+
+unsigned starpu_sched_ctx_get_priority(int worker, unsigned sched_ctx_id);
 #ifdef STARPU_USE_SC_HYPERVISOR
 void starpu_sched_ctx_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif //STARPU_USE_SC_HYPERVISOR
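
Two public entry points are added here: starpu_sched_ctx_set_priority_on_level() and starpu_sched_ctx_get_priority(). A hedged usage sketch; the 0 = low / 1 = high convention is an assumption carried over from the existing starpu_sched_ctx_set_priority(), and the worker list is a placeholder:

#include <starpu.h>
#include <starpu_sched_ctx.h>

void priority_example(int *workers, unsigned nworkers, unsigned sched_ctx)
{
	/* lower the priority of these workers in the contexts that sit on
	   the same hierarchy level as sched_ctx */
	starpu_sched_ctx_set_priority_on_level(workers, nworkers, sched_ctx, 0);

	/* read back the priority of the first worker (assumes nworkers >= 1);
	   the new getter returns 1 when the context is not in the worker's list */
	unsigned prio = starpu_sched_ctx_get_priority(workers[0], sched_ctx);
	(void)prio;
}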

+ 1 - 1
include/starpu_worker.h

@@ -57,7 +57,7 @@ struct starpu_worker_collection
 {
 	void *workerids;
 	unsigned nworkers;
-	unsigned present[STARPU_NMAXWORKERS];
+	int present[STARPU_NMAXWORKERS];
 	enum starpu_worker_collection_type type;
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	int (*get_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);

+ 6 - 1
sc_hypervisor/include/sc_hypervisor.h

@@ -129,7 +129,7 @@ void sc_hypervisor_update_diff_total_flops(unsigned sched_ctx, double diff_total
 void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_task_flops);
 
 /* updates the min and max workers needed by each context */
-void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs);
+void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs, int max_nworkers);
 
 /* returns a list of contexts that are on the same level in the hierarchy of contexts */
 void sc_hypervisor_get_ctxs_on_level(unsigned **sched_ctxs, int *nsched_ctxs, unsigned hierarchy_level, unsigned father_sched_ctx_id);
@@ -137,8 +137,13 @@ void sc_hypervisor_get_ctxs_on_level(unsigned **sched_ctxs, int *nsched_ctxs, un
 /* returns the number of levels of ctxs registered to the hyp */
 unsigned sc_hypervisor_get_nhierarchy_levels(void);
 
+/* returns the leaf contexts from the given list of contexts */
+void sc_hypervisor_get_leaves(unsigned *sched_ctxs, int nsched_ctxs, unsigned *leaves, int *nleaves);
+
 /* returns the nready flops of all ctxs below sched_ctx in the hierarchy */
 double sc_hypervisor_get_nready_flops_of_all_sons_of_sched_ctx(unsigned sched_ctx);
+
+void sc_hypervisor_print_overhead();
 #ifdef __cplusplus
 }
 #endif
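
sc_hypervisor_get_leaves() factors out the leaf-detection loop that feft_lp_policy.c used to carry inline (see the removal further down) and that sc_hypervisor.c now needs as well. A minimal usage sketch, assuming the caller already holds the list of registered contexts:

#include <starpu.h>
#include <sc_hypervisor.h>

void resize_only_leaves(unsigned *sched_ctxs, int nsched_ctxs)
{
	unsigned leaves[STARPU_NMAX_SCHED_CTXS];
	int nleaves = 0;

	/* keep only the contexts that are nobody's inheritor (father) */
	sc_hypervisor_get_leaves(sched_ctxs, nsched_ctxs, leaves, &nleaves);

	int s;
	for(s = 0; s < nleaves; s++)
	{
		/* hypothetical per-leaf action, e.g. trigger a resize check on leaves[s] */
	}
}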

+ 3 - 0
sc_hypervisor/include/sc_hypervisor_lp.h

@@ -50,6 +50,9 @@ void sc_hypervisor_lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rou
 /* make the first distribution of resources in contexts by assigning the first x available resources to each one */
 void sc_hypervisor_lp_distribute_resources_in_ctxs(unsigned* sched_ctxs, int ns, int nw, int res_rounded[ns][nw], double res[ns][nw], int *workers, int nworkers, struct types_of_workers *tw);
 
+/* make the first distribution of resources in contexts by assigning the first x available resources to each one, sharing a non-integer number of workers */
+void sc_hypervisor_lp_distribute_floating_no_resources_in_ctxs(unsigned* sched_ctxs, int ns, int nw, double res[ns][nw], int *workers, int nworkers, struct types_of_workers *tw);
+
 /* place resources in contexts depending on whether they already have workers or not */
 void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], unsigned *sched_ctxs, int *workers, unsigned do_size, struct types_of_workers *tw);
 

+ 4 - 0
sc_hypervisor/include/sc_hypervisor_monitoring.h

@@ -132,6 +132,10 @@ struct sc_hypervisor_wrapper
 	   worker to the idle of the context or just half*/
 	unsigned compute_partial_idle[STARPU_NMAXWORKERS];
 
+	/* whether to consider the max_nworkers bound in the lp */
+	unsigned consider_max;
+
+
 };
 
 /* return the wrapper of context that saves its monitoring information */

+ 6 - 0
sc_hypervisor/include/sc_hypervisor_policy.h

@@ -99,6 +99,12 @@ double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_
 /* compute the speed of a type of worker in a context depending on its history */ 
 double sc_hypervisor_get_ref_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_w, enum starpu_worker_archtype arch);
 
+/* compute the average speed of a type of worker over all ctxs since the beginning of the application */
+double sc_hypervisor_get_avg_speed(enum starpu_worker_archtype arch);
+
+/* check whether the max_nworkers bound needs to be considered in the lp */
+void sc_hypervisor_check_if_consider_max(struct types_of_workers *tw);
+
 /* get the list of workers grouped by type */
 void sc_hypervisor_group_workers_by_type(struct types_of_workers *tw, int *total_nw);
 

+ 18 - 22
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -26,8 +26,9 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, i
 {
 	/* for vite */
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 	printf("resize_no = %d %d ctxs\n", resize_no, ns);
-
+#endif
 	if(ns <= 0) return;
 
 	unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;
@@ -54,14 +55,20 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, i
 	
 	__attribute__((unused))	float timing = (float)(diff_s*1000000 + diff_us)/1000;
 	
-	if(vmax != 0.0)
+	if(vmax != -1.0)
 	{
-		int nworkers_per_ctx_rounded[ns][nw];
-		sc_hypervisor_lp_round_double_to_int(ns, nw, nworkers_per_ctx, nworkers_per_ctx_rounded);
-//		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, curr_sched_ctxs, tw);
-		sc_hypervisor_lp_distribute_resources_in_ctxs(curr_sched_ctxs, ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, workers, curr_nworkers, tw);
+/* 		int nworkers_per_ctx_rounded[ns][nw]; */
+/* 		sc_hypervisor_lp_round_double_to_int(ns, nw, nworkers_per_ctx, nworkers_per_ctx_rounded); */
+/* //		sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, curr_sched_ctxs, tw); */
+/* 		sc_hypervisor_lp_distribute_resources_in_ctxs(curr_sched_ctxs, ns, nw, nworkers_per_ctx_rounded, nworkers_per_ctx, workers, curr_nworkers, tw); */
+		sc_hypervisor_lp_distribute_floating_no_resources_in_ctxs(curr_sched_ctxs, ns, nw, nworkers_per_ctx, workers, curr_nworkers, tw);
+
 		sc_hypervisor_lp_share_remaining_resources(ns, curr_sched_ctxs, curr_nworkers, workers);
 	}
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+	printf("*****finished resize \n");
+#endif
+	return;
 }
 
 static void _try_resizing_hierarchically(unsigned levels, unsigned current_level, unsigned *sched_ctxs, unsigned nsched_ctxs, int *pus, int npus)
@@ -117,7 +124,9 @@ static int _get_first_level(unsigned *sched_ctxs, int nsched_ctxs, unsigned *fir
 
 static void _resize(unsigned *sched_ctxs, int nsched_ctxs, int *workers, int nworkers)
 {
+#ifdef STARPU_USE_FXT
 	starpu_fxt_trace_user_event(resize_no);
+#endif
 	unsigned nhierarchy_levels = sc_hypervisor_get_nhierarchy_levels();
 	if(nhierarchy_levels > 1)
 	{
@@ -270,7 +279,9 @@ static void feft_lp_size_ctxs(unsigned *sched_ctxs, int nsched_ctxs, int *worker
 	}
 
 	_resize(sched_ctxs, nsched_ctxs, workers, nworkers);
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 	printf("finished size ctxs\n");
+#endif
 	starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
@@ -296,22 +307,7 @@ static void _resize_leaves(int worker)
 
 	unsigned leaves[nsched_ctxs];
 	unsigned nleaves = 0;
-	for(s = 0; s < nworkers_sched_ctxs; s++)
-	{
-		unsigned is_someones_father = 0;
-		for(s2 = 0; s2 < nworkers_sched_ctxs; s2++)
-		{
-			unsigned father = starpu_sched_ctx_get_inheritor(workers_sched_ctxs[s2]);
-			if(workers_sched_ctxs[s] == father)
-			{
-				is_someones_father = 1;
-				break;
-			}
-		}
-		if(!is_someones_father)
-			leaves[nleaves++] = workers_sched_ctxs[s];
-	}
-
+	sc_hypervisor_get_leaves(workers_sched_ctxs, nworkers_sched_ctxs, leaves, &nleaves);
 	for(s = 0; s < nleaves; s++)
 		_resize_if_speed_diff(leaves[s], worker);
 }

+ 0 - 1
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -110,7 +110,6 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
 static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
-	starpu_fxt_trace_user_event(2);
         int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	int nw = nworkers == -1 ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
         unsigned *curr_sched_ctxs = sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : sched_ctxs;

+ 0 - 1
sc_hypervisor/src/hypervisor_policies/teft_lp_policy.c

@@ -166,7 +166,6 @@ static void teft_lp_handle_submitted_job(struct starpu_codelet *cl, unsigned sch
 
 static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs , int *workers, int nworkers)
 {
-	starpu_fxt_trace_user_event(2);
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	int nw = workers == NULL ? (int)starpu_worker_get_count() : nworkers; /* Number of different workers */
 

+ 43 - 10
sc_hypervisor/src/policies_utils/lp_programs.c

@@ -252,7 +252,7 @@ double sc_hypervisor_lp_simulate_distrib_tasks(int ns, int nw, int nt, double w_
 double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw], double flops[ns], double res[ns][nw], 
 					       int  total_nw[nw], unsigned sched_ctxs[ns], double last_vmax)
 {
-	int integer = 1;
+	int integer = 0;
 	int s, w;
 	glp_prob *lp;
 
@@ -272,11 +272,13 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 	   and another column corresponding to the 1/tmax bound (bc 1/tmax is a variable too)*/
 	glp_add_cols(lp, nw*ns+1);
 
+	struct sc_hypervisor_wrapper *sc_w = NULL;
 	for(s = 0; s < ns; s++)
 	{
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
 		for(w = 0; w < nw; w++)
 		{
-			struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
 			char name[32];
 			snprintf(name, sizeof(name), "worker%dctx%d", w, s);
 			glp_set_col_name(lp, n, name);
@@ -284,22 +286,50 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
 			if (integer)
 			{
 				glp_set_col_kind(lp, n, GLP_IV);
-				if(config->max_nworkers == 0)
-					glp_set_col_bnds(lp, n, GLP_FX, config->min_nworkers, config->max_nworkers);
+				if(sc_w->consider_max)
+				{
+					if(config->max_nworkers == 0)
+						glp_set_col_bnds(lp, n, GLP_FX, config->min_nworkers, config->max_nworkers);
+					else
+						glp_set_col_bnds(lp, n, GLP_DB, config->min_nworkers, config->max_nworkers);
+				}
 				else
-					glp_set_col_bnds(lp, n, GLP_DB, config->min_nworkers, config->max_nworkers);
+				{
+					if(total_nw[w] == 0)
+						glp_set_col_bnds(lp, n, GLP_FX, config->min_nworkers, total_nw[w]);
+					else
+						glp_set_col_bnds(lp, n, GLP_DB, config->min_nworkers, total_nw[w]);
+				}
 			}
 			else
 			{
-				if(config->max_nworkers == 0)
-					glp_set_col_bnds(lp, n, GLP_FX, config->min_nworkers*1.0, config->max_nworkers*1.0);
+				if(sc_w->consider_max)
+				{
+					if(config->max_nworkers == 0)
+						glp_set_col_bnds(lp, n, GLP_FX, config->min_nworkers*1.0, config->max_nworkers*1.0);
+					else
+						glp_set_col_bnds(lp, n, GLP_DB, config->min_nworkers*1.0, config->max_nworkers*1.0);
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+					printf("%d****************consider max %lf in lp\n", sched_ctxs[s], config->max_nworkers*1.0);
+#endif
+				}
 				else
-					glp_set_col_bnds(lp, n, GLP_DB, config->min_nworkers*1.0, config->max_nworkers*1.0);
+				{
+					if(total_nw[w] == 0)
+						glp_set_col_bnds(lp, n, GLP_FX, config->min_nworkers*1.0, total_nw[w]*1.0);
+					else
+						glp_set_col_bnds(lp, n, GLP_DB, config->min_nworkers*1.0, total_nw[w]*1.0);
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+					printf("%d****************don't consider max %d but total %d in lp\n", sched_ctxs[s], config->max_nworkers, total_nw[w]);
+#endif
+				}
 			}
 			n++;
 		}
 	}
-
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+	printf("ns = %d nw = %d\n", ns, nw);
+#endif
 	/*1/tmax should belong to the interval [0.0;1.0]*/
 	glp_set_col_name(lp, n, "vmax");
 //	glp_set_col_bnds(lp, n, GLP_DB, 0.0, 1.0);
@@ -456,8 +486,9 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
         }
 
 	double vmax = glp_get_obj_val(lp);
-
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 	printf("vmax = %lf \n", vmax);
+#endif
 	n = 1;
 	for(s = 0; s < ns; s++)
 	{
@@ -467,7 +498,9 @@ double sc_hypervisor_lp_simulate_distrib_flops(int ns, int nw, double v[ns][nw],
                                 res[s][w] = (double)glp_mip_col_val(lp, n);
 			else
 				res[s][w] = glp_get_col_prim(lp, n);
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
   			printf("%d/%d: res %lf flops = %lf v = %lf\n", w,s, res[s][w], flops[s], v[s][w]);
+#endif
 			n++;
 		}
 	}
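
Besides switching the LP to its continuous relaxation (integer = 0), the hunk above picks the upper bound of each (context, worker type) column from either the per-context max_nworkers or the machine-wide total, depending on the new consider_max flag. The decision boils down to the helper below (illustrative only, not code from the patch):

/* upper bound of one LP column: trust the per-context max_nworkers only
   when the context's observed speed made consider_max true, otherwise let
   the LP use up to all workers of that type */
static double column_upper_bound(unsigned consider_max, int max_nworkers, int total_nw)
{
	return consider_max ? (double)max_nworkers : (double)total_nw;
}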

+ 186 - 30
sc_hypervisor/src/policies_utils/lp_tools.c

@@ -28,9 +28,9 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 	double v[nsched_ctxs][ntypes_of_workers];
 	double flops[nsched_ctxs];
 	
-	unsigned nhierarchy_levels = sc_hypervisor_get_nhierarchy_levels();
-	if(nhierarchy_levels <= 1)
-		sc_hypervisor_update_resize_interval(sched_ctxs, nsched_ctxs);
+/* 	unsigned nhierarchy_levels = sc_hypervisor_get_nhierarchy_levels(); */
+/* 	if(nhierarchy_levels <= 1) */
+	sc_hypervisor_update_resize_interval(sched_ctxs, nsched_ctxs, total_nw[0]);
 
 	int nw = tw->nw;
 	int i = 0;
@@ -41,7 +41,7 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
 		int w;
 		for(w = 0; w < nw; w++)
-			v[i][w] = sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw)); 
+			v[i][w] = 5.0;//sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw)); 
 
 		double ready_flops = starpu_sched_ctx_get_nready_flops(sc_w->sched_ctx);
 		unsigned nhierarchy_levels = sc_hypervisor_get_nhierarchy_levels();
@@ -72,10 +72,57 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 		}
 		if(flops[i] < 0.0)
 			flops[i] = 0.0;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 		printf("%d: flops %lf remaining flops %lf ready flops %lf nready_tasks %d\n",
 		       sched_ctxs[i], flops[i], sc_w->remaining_flops/1000000000, ready_flops/1000000000, nready_tasks);
+#endif
 
 	}
+	sc_hypervisor_check_if_consider_max(tw);
+	int w;
+	for(w = 0; w < nw; w++)
+	{
+		double avg_speed = sc_hypervisor_get_avg_speed(sc_hypervisor_get_arch_for_index(w, tw));
+		if(avg_speed != -1.0)
+		{
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+			printf("avg_speed for cpus is %lf \n", avg_speed);
+#endif
+			unsigned consider_max_for_all = 0;
+			for(i = 0; i < nsched_ctxs; i++)
+			{
+				sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+				
+				if(!sc_w->consider_max)
+				{
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+					printf("ctx %d: current speed is %lf, compared against min %lf max %lf\n", sched_ctxs[i], v[i][w], (0.1*avg_speed), (2*avg_speed));
+#endif
+					if(v[i][w] < 0.1*avg_speed || v[i][w] > 2*avg_speed)
+					{
+						sc_w->consider_max = 1;
+						consider_max_for_all = 1;
+					}
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+					printf("ctx %d consider max %d \n", sched_ctxs[i], sc_w->consider_max);
+#endif
+				}
+
+			}
+			if(consider_max_for_all)
+			{
+				for(i = 0; i < nsched_ctxs; i++)
+				{
+					sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+					sc_w->consider_max = 1;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+					printf("ctx %d consider max %d anyway \n", sched_ctxs[i], sc_w->consider_max);
+#endif
+				}
+			}
+
+		}
+	}
 
 	if(nsched_ctxs == 1)
 	{
@@ -101,7 +148,8 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 	for(i = 0; i < nsched_ctxs; i++)
 	{
 		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[i]);
-		if(config->max_nworkers != 0)
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+		if(config->max_nworkers != 0 || !sc_w->consider_max)
 		{
 			tmp_sched_ctxs[tmp_nsched_ctxs] = sched_ctxs[i];
 			tmp_flops[tmp_nsched_ctxs] = flops[i];
@@ -111,7 +159,8 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 			tmp_nsched_ctxs++;
 		}
 	}
-	
+	if(tmp_nsched_ctxs == 0)
+		return -1.0;
 	double ret = sc_hypervisor_lp_simulate_distrib_flops(tmp_nsched_ctxs, ntypes_of_workers, tmp_v, tmp_flops, tmp_res, total_nw, tmp_sched_ctxs, -1.0);
 
 	int j;
@@ -203,31 +252,47 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
 
 	}
 
+	/* if the lp could not give any workers to any context,
+	   just split the workers between the contexts */
+	if(ret == 0.0)
+	{
+		double rand_res[nw];
+		int w;
+		for(w = 0; w < nw; w++)
+			rand_res[w] = total_nw[w]/nsched_ctxs;
+		int s;
+		for(s = 0; s < nsched_ctxs; s++)
+			for(w = 0; w < nw; w++)
+				res[s][w] = rand_res[w];
+	}
+
+	else
 	/* keep the first speed */
-	if(ret != 0.0)
+//	if(ret != 0.0)
 	{
 		vmax = 1 / ret;
-		double optimal_v = 0.0;
-		for(i = 0; i < nsched_ctxs; i++)
-		{
+	}
+	double optimal_v = 0.0;
+	for(i = 0; i < nsched_ctxs; i++)
+	{
 #ifdef STARPU_USE_CUDA
-			optimal_v = res[i][0] * v[i][0] + res[i][1]* v[i][1];
+		optimal_v = res[i][0] * v[i][0] + res[i][1]* v[i][1];
 #else
-			optimal_v = res[i][0] * v[i][0];
+		optimal_v = res[i][0] * v[i][0];
 #endif //STARPU_USE_CUDA
-			int w;
-			unsigned no_workers = 1;
-			for(w = 0; w < nw; w++)
+		int w;
+		unsigned no_workers = 1;
+		for(w = 0; w < nw; w++)
+		{
+			if(res[i][w] != 0.0)
 			{
-				if(res[i][w] != 0.0)
-				{
-					no_workers = 0;
-					break;
-				}
+				no_workers = 0;
+				break;
 			}
-			
-			sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
-			
+		}
+		
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[i]);
+		
 /* if the hypervisor gave 0 workers to a context but the context still 
    has some last flops or a ready task that does not even have any flops
    we give a worker (in shared mode) to the context in order to leave him
@@ -235,15 +300,14 @@ double sc_hypervisor_lp_get_nworkers_per_ctx(int nsched_ctxs, int ntypes_of_work
    the distribution function we take this into account and revert the variable
    to its 0.0 value */ 
 //		if(no_workers && (flops[i] != 0.0 || sc_w->nready_tasks > 0))
-			if(no_workers)
-			{
-				for(w = 0; w < nw; w++)
-					res[i][w] = -1.0;
-			}
+		if(no_workers)
+		{
+			for(w = 0; w < nw; w++)
+				res[i][w] = -1.0;
+		}
 			
 //			if(optimal_v != 0.0)
-				_set_optimal_v(sched_ctxs[i], optimal_v);
-		}
+		_set_optimal_v(sched_ctxs[i], optimal_v);
 	}
 
 	return vmax;
@@ -680,6 +744,98 @@ void sc_hypervisor_lp_distribute_resources_in_ctxs(unsigned* sched_ctxs, int ns,
 	}
 }
 
+void sc_hypervisor_lp_distribute_floating_no_resources_in_ctxs(unsigned* sched_ctxs, int ns, int nw, double res[ns][nw], 
+							       int *workers, int nworkers, struct types_of_workers *tw)
+{
+	int s, w;
+	int start[nw];
+	for(w = 0; w < nw; w++)
+		start[w] = 0;
+	for(s = 0; s < ns; s++)
+	{
+		int workers_add[STARPU_NMAXWORKERS];
+                int nw_add = 0;
+		double target_res = 0.0;
+		for(w = 0; w < nw; w++)
+		{
+			target_res += res[s][w];
+			if(res[s][w] == -1.0) res[s][w] = 0.0;
+		}
+
+		for(w = 0; w < nw; w++)
+		{
+			enum starpu_worker_archtype arch = sc_hypervisor_get_arch_for_index(w, tw);
+			
+			if(arch == STARPU_CPU_WORKER) 
+			{
+				int nworkers_to_add = ceil(res[s][w]);
+				double ceil_double = (double)nworkers_to_add;
+				double diff = ceil_double - res[s][w];
+
+				if(target_res < 0.0)
+				{
+					nworkers_to_add=1;
+					int old_start = start[w];
+					if(start[w] != 0)
+						start[w]--;
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &nworkers_to_add, arch);
+					start[w] = old_start;
+					int i;
+					for(i = 0; i < nworkers_to_add; i++)
+					{
+						workers_add[nw_add++] = workers_to_add[i];
+					}
+					free(workers_to_add);
+				}
+				else
+				{
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &nworkers_to_add, arch);
+					int i;
+					for(i = 0; i < nworkers_to_add; i++)
+						workers_add[nw_add++] = workers_to_add[i];
+					free(workers_to_add);
+				}
+				if(diff != 0.0)
+					start[w]--;
+			}
+			else
+			{
+				double nworkers_to_add = res[s][w];
+				int x = floor(nworkers_to_add);
+				double x_double = (double)x;
+				double diff = nworkers_to_add - x_double;
+				if(diff == 0.0)
+				{
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &x, arch);
+					int i;
+					for(i = 0; i < x; i++)
+						workers_add[nw_add++] = workers_to_add[i];
+					free(workers_to_add);
+				}
+				else
+				{
+					x+=1;
+					int *workers_to_add = sc_hypervisor_get_idlest_workers_in_list(&start[w], workers, nworkers, &x, arch);
+					int i;
+					if(diff >= 0.3)
+						for(i = 0; i < x; i++)
+							workers_add[nw_add++] = workers_to_add[i];
+					else
+						for(i = 0; i < x-1; i++)
+							workers_add[nw_add++] = workers_to_add[i];
+					
+					free(workers_to_add);
+				}
+			}
+		}
+//		sc_hypervisor_start_resize(sched_ctxs[s]);
+		sc_hypervisor_add_workers_to_sched_ctx(workers_add, nw_add, sched_ctxs[s]);
+		int workers_remove[STARPU_NMAXWORKERS];
+		int nw_remove = _lp_get_unwanted_workers(workers_add, nw_add, sched_ctxs[s], workers_remove);
+		sc_hypervisor_remove_workers_from_sched_ctx(workers_remove, nw_remove, sched_ctxs[s], !(_sc_hypervisor_use_lazy_resize()));
+	}
+}
+
 /* nw = all the workers (either in a list or on all machine) */
 void sc_hypervisor_lp_place_resources_in_ctx(int ns, int nw, double w_in_s[ns][nw], unsigned *sched_ctxs_input, int *workers_input, unsigned do_size, struct types_of_workers *tw)
 {
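
sc_hypervisor_lp_distribute_floating_no_resources_in_ctxs() grants each context a fractional number of workers; for non-CPU workers the fractional part is rounded up only once it reaches 0.3. The rule can be restated by the standalone helper below (hypothetical, kept separate from the patch):

#include <math.h>

/* whole workers actually granted for a fractional request, mirroring the
   0.3 threshold used by the new distribution function */
static int workers_granted(double requested)
{
	int whole = (int)floor(requested);
	double frac = requested - (double)whole;
	return (frac >= 0.3) ? whole + 1 : whole;
}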

+ 1 - 1
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -511,7 +511,7 @@ unsigned sc_hypervisor_check_speed_gap_btw_ctxs(unsigned *sched_ctxs_in, int ns_
 		double vmax = sc_hypervisor_lp_get_nworkers_per_ctx(ns, nw, nworkers_per_ctx, total_nw, tw, sched_ctxs);
 
 		
-		if(vmax != 0.0)
+//		if(vmax != 0.0)
 		{
 			for(i = 0; i < ns; i++)
 			{

+ 120 - 0
sc_hypervisor/src/policies_utils/speed.c

@@ -140,6 +140,17 @@ double sc_hypervisor_get_speed_per_worker_type(struct sc_hypervisor_wrapper* sc_
 			enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
 			if(arch == req_arch && sc_w->compute_idle[worker])
 			{
+				if(sc_w->exec_start_time[worker] != 0.0)
+				{
+					double current_exec_time = 0.0;
+					if(sc_w->exec_start_time[worker] < sc_w->start_time)
+						current_exec_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */ 
+					else
+						current_exec_time = (curr_time - sc_w->exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+					double suppl_flops = current_exec_time * sc_hypervisor_get_ref_speed_per_worker_type(sc_w, req_arch);
+					all_workers_flops += suppl_flops;
+				}		
+
 				all_workers_flops += sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
 				if(max_workers_idle_time < sc_w->idle_time[worker])
 					max_workers_idle_time = sc_w->idle_time[worker]; /* in seconds */
@@ -201,3 +212,112 @@ double sc_hypervisor_get_speed(struct sc_hypervisor_wrapper *sc_w, enum starpu_w
 
 	return speed;
 }
+
+double sc_hypervisor_get_avg_speed(enum starpu_worker_archtype arch)
+{
+	double total_executed_flops = 0.0;
+	double total_estimated_flops = 0.0;
+	struct sc_hypervisor_wrapper *sc_w;
+	double max_real_start_time = 0.0;
+	int s;
+	unsigned nworkers =  starpu_worker_get_count_by_type(arch);
+
+	unsigned *sched_ctxs;
+	int nsched_ctxs;
+	sc_hypervisor_get_ctxs_on_level(&sched_ctxs, &nsched_ctxs, 0, STARPU_NMAX_SCHED_CTXS);
+	
+	for(s = 0; s < nsched_ctxs; s++)
+	{
+		sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
+		struct starpu_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctxs[s]);
+		int worker;
+		
+		struct starpu_sched_ctx_iterator it;
+		if(workers->init_iterator)
+			workers->init_iterator(workers, &it);
+
+		while(workers->has_next(workers, &it))
+		{
+			worker = workers->get_next(workers, &it);
+			enum starpu_worker_archtype req_arch = starpu_worker_get_type(worker);
+			if(arch == req_arch)
+			{
+				total_executed_flops += sc_w->total_elapsed_flops[worker] / 1000000000.0; /*in gflops */;
+			}
+		}
+
+		total_estimated_flops += sc_w->total_flops / 1000000000.0; /*in gflops */;;
+
+		if(max_real_start_time < sc_w->real_start_time)
+			max_real_start_time = sc_w->real_start_time;
+	}
+	double speed = -1.0;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+	printf("total_exec_flops %lf total_estimated_flops %lf max_real_start_time %lf nworkers %d \n", total_executed_flops, total_estimated_flops, max_real_start_time, nworkers);
+#endif
+	if(total_executed_flops > 0.5*total_estimated_flops)
+	{
+		double curr_time = starpu_timing_now();
+		double time = (curr_time - max_real_start_time) / 1000000.0; /* in seconds */
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+		printf("time = %lf\n", time);
+#endif
+		speed = (total_executed_flops / time) / nworkers; 
+	}
+
+	return speed;
+}
+
+void _consider_max_for_children(unsigned sched_ctx, unsigned consider_max)
+{
+	struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctx);
+	sc_w->consider_max = consider_max;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+	printf("ctx %d consider max %d \n", sched_ctx, sc_w->consider_max); 
+#endif
+
+	int level = starpu_sched_ctx_get_hierarchy_level(sched_ctx);
+	unsigned *sched_ctxs_child;
+	int nsched_ctxs_child = 0;
+	sc_hypervisor_get_ctxs_on_level(&sched_ctxs_child, &nsched_ctxs_child, level+1, sched_ctx);
+	int s;
+	for(s = 0; s < nsched_ctxs_child; s++)
+		_consider_max_for_children(sched_ctxs_child[s], consider_max);
+	if(nsched_ctxs_child > 0)
+		free(sched_ctxs_child);
+	return;
+}
+
+void sc_hypervisor_check_if_consider_max(struct types_of_workers *tw)
+{
+	unsigned *sched_ctxs;
+	int nsched_ctxs;
+	sc_hypervisor_get_ctxs_on_level(&sched_ctxs, &nsched_ctxs, 0, STARPU_NMAX_SCHED_CTXS);
+
+	int nw = tw->nw;
+	double avg_speed_per_tw[nw];
+	int w;
+	for(w = 0; w < nw; w++)
+	{
+		avg_speed_per_tw[w] = sc_hypervisor_get_avg_speed(sc_hypervisor_get_arch_for_index(w, tw));
+		if(avg_speed_per_tw[w] == -1.0)
+			return;
+	}
+
+	int s;
+	for(s = 0; s < nsched_ctxs; s++)
+	{
+		for(w = 0; w < nw; w++)
+		{
+			struct sc_hypervisor_wrapper *sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
+			double speed = sc_hypervisor_get_speed(sc_w, sc_hypervisor_get_arch_for_index(w, tw)); 
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+			printf("%d: speed %lf avg_speed %lf min %lf max %lf\n", sched_ctxs[s], speed, avg_speed_per_tw[w], (avg_speed_per_tw[w]*0.5), (avg_speed_per_tw[w]*1.5));
+#endif
+			if(speed < avg_speed_per_tw[w]*0.5 || speed > avg_speed_per_tw[w]*1.5)
+				_consider_max_for_children(sched_ctxs[s], 1);
+			else
+				_consider_max_for_children(sched_ctxs[s], 0);
+		}
+	}
+}
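
sc_hypervisor_get_avg_speed() stays at -1.0 until more than half of the estimated flops have been executed, then averages the executed Gflops over the elapsed time and the number of workers of that type. A condensed restatement (illustrative helper, not part of the patch):

/* per-worker average speed in Gflop/s, valid only once more than half of
   the estimated work has already executed */
static double avg_speed_rule(double executed_gflops, double estimated_gflops,
			     double elapsed_seconds, unsigned nworkers)
{
	if(executed_gflops <= 0.5 * estimated_gflops)
		return -1.0;	/* not enough history yet */
	return (executed_gflops / elapsed_seconds) / (double)nworkers;
}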

+ 296 - 53
sc_hypervisor/src/sc_hypervisor.c

@@ -18,6 +18,7 @@
 #include <sc_hypervisor_policy.h>
 #include <starpu_config.h>
 
+double hyp_overhead = 0.0;
 unsigned imposed_resize = 0;
 unsigned type_of_tasks_known = 0;
 struct starpu_sched_ctx_performance_counters* perf_counters = NULL;
@@ -164,6 +165,7 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 
 	starpu_pthread_mutex_init(&act_hypervisor_mutex, NULL);
 	hypervisor.start_executing_time = starpu_timing_now();
+
 	int i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	{
@@ -192,6 +194,7 @@ void* sc_hypervisor_init(struct sc_hypervisor_policy *hypervisor_policy)
 		hypervisor.sched_ctx_w[i].ref_speed[1] = -1.0;
 		hypervisor.sched_ctx_w[i].total_flops_available = 0;
 		hypervisor.sched_ctx_w[i].to_be_sized = 0;
+		hypervisor.sched_ctx_w[i].consider_max = 0;
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
 		{
@@ -296,6 +299,22 @@ void sc_hypervisor_shutdown(void)
 	perf_counters = NULL;
 
 	starpu_pthread_mutex_destroy(&act_hypervisor_mutex);
+
+}
+
+void sc_hypervisor_print_overhead()
+{
+//	hyp_overhead /= 1000000.0;
+	FILE *f;
+	const char *sched_env = getenv("OVERHEAD_FILE");
+	if(!sched_env)
+		f = fopen("overhead_microsec", "a");
+	else
+		f = fopen(sched_env, "a");
+	fprintf(f, "%lf \n", hyp_overhead);
+	fclose(f);
+
+
 }
 
 /* the hypervisor is in charge only of the contexts registered to it*/
@@ -352,18 +371,21 @@ static void _rearange_sched_ctxs(unsigned *sched_ctxs, int old_nsched_ctxs)
 /* unregistered contexts will no longer be resized */
 void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 {
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 	printf("unregister ctx %d with remaining flops %lf \n", hypervisor.sched_ctx_w[sched_ctx].sched_ctx, hypervisor.sched_ctx_w[sched_ctx].remaining_flops);
+#endif
 	if(hypervisor.policy.end_ctx)
 		hypervisor.policy.end_ctx(sched_ctx);
 
+	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	unsigned father = starpu_sched_ctx_get_inheritor(sched_ctx);
 	int *pus;
 	unsigned npus = starpu_sched_ctx_get_workers_list(sched_ctx, &pus);
 
 	starpu_sched_ctx_set_priority(pus, npus, father, 1);
+	starpu_sched_ctx_set_priority_on_level(pus, npus, father, 1);
 	free(pus);
 
-	starpu_pthread_mutex_lock(&act_hypervisor_mutex);
 	unsigned i;
 	for(i = 0; i < hypervisor.nsched_ctxs; i++)
 	{
@@ -506,7 +528,6 @@ static void _decrement_elapsed_flops_per_worker(unsigned sched_ctx, int worker,
 
 	return;
 }
-
 void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sched_ctx)
 {
 	double start_time =  starpu_timing_now();
@@ -516,15 +537,17 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 		struct sc_hypervisor_wrapper *sender_sc_w = &hypervisor.sched_ctx_w[sender_sched_ctx];
 		
 		sender_sc_w->start_time = start_time;
+		unsigned nworkers = starpu_worker_get_count();
 		int i;
-		for(i = 0; i < STARPU_NMAXWORKERS; i++)
+ 		for(i = 0; i < nworkers; i++)
 		{
 			sender_sc_w->start_time_w[i] = start_time;
 			sender_sc_w->idle_time[i] = 0.0;
 			sender_sc_w->idle_start_time[i] = 0.0;
 			hypervisor.sched_ctx_w[sender_sched_ctx].exec_time[i] = 0.0;
-			hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] = (hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
+//			hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] = (hypervisor.sched_ctx_w[sender_sched_ctx].exec_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
 			_decrement_elapsed_flops_per_worker(sender_sched_ctx, i, hypervisor.sched_ctx_w[sender_sched_ctx].elapsed_flops[i]); 
+
 		}
 		_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
 	}
@@ -534,13 +557,15 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 		struct sc_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
 		
 		receiver_sc_w->start_time = start_time;
+
+		unsigned nworkers = starpu_worker_get_count();
 		int i;
-		for(i = 0; i < STARPU_NMAXWORKERS; i++)
+ 		for(i = 0; i < nworkers; i++)
 		{
 			receiver_sc_w->start_time_w[i] = (receiver_sc_w->start_time_w[i] != 0.0) ? starpu_timing_now() : 0.0;
 			receiver_sc_w->idle_time[i] = 0.0;
 			receiver_sc_w->idle_start_time[i] = (receiver_sc_w->exec_start_time[i] != 0.0) ? 0.0 : starpu_timing_now();
-			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_start_time[i] = (receiver_sc_w->exec_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
+//			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_start_time[i] = (receiver_sc_w->exec_start_time[i] != 0.0) ? starpu_timing_now() : 0.0;
 			hypervisor.sched_ctx_w[receiver_sched_ctx].exec_time[i] = 0.0;
 			_decrement_elapsed_flops_per_worker(receiver_sched_ctx, i, hypervisor.sched_ctx_w[receiver_sched_ctx].elapsed_flops[i]); 
 		}
@@ -557,22 +582,25 @@ void sc_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sch
 	{
 		_print_current_time();
 		unsigned j;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 		printf("resize ctx %d with %d workers", sender_sched_ctx, nworkers_to_move);
 		for(j = 0; j < nworkers_to_move; j++)
 			printf(" %d", workers_to_move[j]);
 		printf("\n");
-		starpu_fxt_trace_user_event(1);
+#endif
+
 		hypervisor.allow_remove[receiver_sched_ctx] = 0;
 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
 
 		if(now)
 		{
 			unsigned j;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 			printf("remove now from ctx %d:", sender_sched_ctx);
 			for(j = 0; j < nworkers_to_move; j++)
 				printf(" %d", workers_to_move[j]);
 			printf("\n");
-
+#endif
 			starpu_sched_ctx_remove_workers(workers_to_move, nworkers_to_move, sender_sched_ctx);
 			hypervisor.allow_remove[receiver_sched_ctx] = 1;
 			_reset_resize_sample_info(sender_sched_ctx, receiver_sched_ctx);
@@ -616,10 +644,12 @@ void sc_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned nworke
 	{
 		_print_current_time();
 		unsigned j;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 		printf("add to ctx %d:", sched_ctx);
 		for(j = 0; j < nworkers_to_add; j++)
 			printf(" %d", workers_to_add[j]);
 		printf("\n");
+#endif
 		starpu_sched_ctx_add_workers(workers_to_add, nworkers_to_add, sched_ctx);
 		struct sc_hypervisor_policy_config *new_config = sc_hypervisor_get_config(sched_ctx);
 		unsigned i;
@@ -647,22 +677,24 @@ void sc_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove, unsigne
 		if(now)
 		{
 			unsigned j;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 			printf("remove explicitly now from ctx %d:", sched_ctx);
 			for(j = 0; j < nworkers_to_remove; j++)
 				printf(" %d", workers_to_remove[j]);
 			printf("\n");
-			
+#endif
 			starpu_sched_ctx_remove_workers(workers_to_remove, nworkers_to_remove, sched_ctx);
 			_reset_resize_sample_info(sched_ctx, STARPU_NMAX_SCHED_CTXS);
 		}
 		else
 		{
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 			printf("try to remove from ctx %d: ", sched_ctx);
 			unsigned j;
 			for(j = 0; j < nworkers_to_remove; j++)
 				printf(" %d", workers_to_remove[j]);
 			printf("\n");
-
+#endif
 			int ret = starpu_pthread_mutex_trylock(&hypervisor.sched_ctx_w[sched_ctx].mutex);
 			if(ret != EBUSY)
 			{
@@ -831,15 +863,115 @@ void _sc_hypervisor_allow_compute_idle(unsigned sched_ctx, int worker, unsigned
 	hypervisor.sched_ctx_w[sched_ctx].compute_idle[worker] = allow;
 }
 
-void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
+
+int _update_max_hierarchically(unsigned *sched_ctxs, int nsched_ctxs)
+{
+	int s, i;
+	unsigned leaves[hypervisor.nsched_ctxs];
+	int nleaves = 0;
+	sc_hypervisor_get_leaves(hypervisor.sched_ctxs, hypervisor.nsched_ctxs, leaves, &nleaves);
+
+	int max = 0;
+
+	for(s = 0; s < nsched_ctxs; s++)
+	{
+		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
+		unsigned found = 0;
+		int l = 0;
+		for(l = 0; l < nleaves; l++)
+		{
+			if(leaves[l] == sched_ctxs[s])
+			{
+				found = 1;
+				break;
+			}
+		}
+		if(!found)
+		{
+			config->max_nworkers = 0;
+			int level = starpu_sched_ctx_get_hierarchy_level(sched_ctxs[s]);
+			unsigned *sched_ctxs_child;
+			int nsched_ctxs_child = 0;
+			sc_hypervisor_get_ctxs_on_level(&sched_ctxs_child, &nsched_ctxs_child, level+1, sched_ctxs[s]);
+			if(nsched_ctxs_child > 0)
+			{			
+				config->max_nworkers += _update_max_hierarchically(sched_ctxs_child, nsched_ctxs_child);
+				free(sched_ctxs_child);
+				int max_possible_workers = starpu_worker_get_count();
+				if(config->max_nworkers < 0)
+					config->max_nworkers = 0;
+				if(config->max_nworkers > max_possible_workers)
+					config->max_nworkers = max_possible_workers;
+			
+			}
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+			printf("ctx %d has max %d \n", sched_ctxs[s], config->max_nworkers);
+#endif
+		}
+		max += config->max_nworkers;
+	}
+	return max;
+}
+void _update_max_diff_hierarchically(unsigned father, double diff)
+{
+	int level = starpu_sched_ctx_get_hierarchy_level(father);
+	unsigned *sched_ctxs_child;
+	int nsched_ctxs_child = 0;
+	sc_hypervisor_get_ctxs_on_level(&sched_ctxs_child, &nsched_ctxs_child, level+1, father);
+	if(nsched_ctxs_child > 0)
+	{
+		int s;
+		double total_nflops = 0.0;
+		for(s = 0; s < nsched_ctxs_child; s++)
+		{
+			total_nflops += hypervisor.sched_ctx_w[sched_ctxs_child[s]].remaining_flops < 0.0 ? 0.0 : hypervisor.sched_ctx_w[sched_ctxs_child[s]].remaining_flops;
+		}
+
+		int accumulated_diff = 0;
+		for(s = 0; s < nsched_ctxs_child; s++)
+		{
+			struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs_child[s]);
+			double remaining_flops = hypervisor.sched_ctx_w[sched_ctxs_child[s]].remaining_flops < 0.0 ? 0.0 : hypervisor.sched_ctx_w[sched_ctxs_child[s]].remaining_flops;
+ 			int current_diff = total_nflops == 0.0 ? 0.0 : floor((remaining_flops / total_nflops) * diff);
+			accumulated_diff += current_diff;
+			if(s == (nsched_ctxs_child - 1) && accumulated_diff < diff)
+				current_diff += (diff - accumulated_diff);
+			config->max_nworkers += current_diff;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+			printf("%d: redib max_nworkers incr %d diff = %d \n",  sched_ctxs_child[s], config->max_nworkers, current_diff);
+#endif
+			_update_max_diff_hierarchically(sched_ctxs_child[s], current_diff);
+		}
+	}
+	return;
+}
+
+void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs, int max_workers)
 {
+	unsigned leaves[hypervisor.nsched_ctxs];
+	unsigned nleaves = 0;
+	sc_hypervisor_get_leaves(hypervisor.sched_ctxs, hypervisor.nsched_ctxs, leaves, &nleaves);
+	int l;
+
 	unsigned sched_ctx;
 	int total_max_nworkers = 0;
-	int max_cpus = starpu_cpu_worker_get_count();
+//	int max_cpus = starpu_cpu_worker_get_count();
 	unsigned configured = 0;
 	int i;
 	for(i = 0; i < nsched_ctxs; i++)
 	{
+		unsigned found = 0;
+		for(l = 0; l < nleaves; l++)
+		{
+			if(leaves[l] == sched_ctxs[i])
+			{
+				found = 1;
+				break;
+			}
+		}
+		if(!found)
+			continue;
+
 		sched_ctx = sched_ctxs[i];
 
 		if(hypervisor.sched_ctx_w[sched_ctx].to_be_sized) continue;
@@ -896,7 +1028,12 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 			}
 			else
 			{
-				double current_exec_time = (end_time - hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+				double current_exec_time = 0.0;
+				if(hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] < hypervisor.sched_ctx_w[sched_ctx].start_time)
+					current_exec_time = (end_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /* in seconds */ 
+				else
+					current_exec_time = (end_time - hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+
 				exec_time = hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] + current_exec_time;
 			}		
 			norm_exec_time += elapsed_time_worker[worker] == 0.0 ? 0.0 : exec_time / elapsed_time_worker[worker];
@@ -905,52 +1042,99 @@ void sc_hypervisor_update_resize_interval(unsigned *sched_ctxs, int nsched_ctxs)
 		double curr_time = starpu_timing_now();
 		double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].start_time) / 1000000.0; /* in seconds */
 		int nready_tasks = starpu_sched_ctx_get_nready_tasks(sched_ctx);
-		if(norm_idle_time >= 0.9)
-		{
-			config->max_nworkers = lrint(norm_exec_time);
-		}
-		else
-		{
-			if(norm_idle_time < 0.1)
-				config->max_nworkers = lrint(norm_exec_time)  + nready_tasks - 1; //workers->nworkers + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1;
-			else
-				config->max_nworkers = lrint(norm_exec_time);
-		}
+/* 		if(norm_idle_time >= 0.9) */
+/* 		{ */
+/* 			config->max_nworkers = lrint(norm_exec_time); */
+/* 		} */
+/* 		else */
+/* 		{ */
+/* 			if(norm_idle_time < 0.1) */
+/* 				config->max_nworkers = lrint(norm_exec_time)  + nready_tasks - 1; //workers->nworkers + hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1; */
+/* 			else */
+/* 				config->max_nworkers = lrint(norm_exec_time); */
+/* 		} */
+		config->max_nworkers = lrint(norm_exec_time);
 //		config->max_nworkers = hypervisor.sched_ctx_w[sched_ctx].nready_tasks - 1;
 		
-		if(config->max_nworkers < 0)
-			config->max_nworkers = 0;
-		if(config->max_nworkers > max_cpus)
-			config->max_nworkers = max_cpus;
+		/* if(config->max_nworkers < 0) */
+/* 			config->max_nworkers = 0; */
+/* 		if(config->max_nworkers > max_workers) */
+/* 			config->max_nworkers = max_workers; */
 		
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
 		printf("%d: ready tasks  %d norm_idle_time %lf elapsed_time %lf norm_exec_time %lf nworker %d max %d \n", 
 		       sched_ctx, nready_tasks, norm_idle_time, elapsed_time, norm_exec_time, workers->nworkers, config->max_nworkers);
-
+#endif
 
 		total_max_nworkers += config->max_nworkers;
 		configured = 1;
+		
 	}
 
-	/*if the sum of the max cpus is smaller than the total cpus available 
-	  increase the max for the ones having more ready tasks to exec */
-	if(configured && total_max_nworkers < max_cpus)
+	unsigned nhierarchy_levels = sc_hypervisor_get_nhierarchy_levels();
+	if(nhierarchy_levels > 1 && configured)
 	{
-		int diff = max_cpus - total_max_nworkers;
-		int max_nready = -1;
-		unsigned max_nready_sched_ctx = sched_ctxs[0];
-		for(i = 0; i < nsched_ctxs; i++)
+		unsigned *sched_ctxs2;
+		int nsched_ctxs2;
+		sc_hypervisor_get_ctxs_on_level(&sched_ctxs2, &nsched_ctxs2, 0, STARPU_NMAX_SCHED_CTXS);
+		
+		if(nsched_ctxs2  > 0)
 		{
-			int nready_tasks = starpu_sched_ctx_get_nready_tasks(sched_ctxs[i]);
-			if(max_nready < nready_tasks)
+			_update_max_hierarchically(sched_ctxs2, nsched_ctxs2);
+			int s;
+			int current_total_max_nworkers = 0;
+			double max_nflops = 0.0;
+			unsigned max_nflops_sched_ctx = sched_ctxs2[0];
+			for(s = 0; s < nsched_ctxs2; s++)
+			{
+				struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs2[s]);
+				current_total_max_nworkers += config->max_nworkers;
+				if(max_nflops < hypervisor.sched_ctx_w[sched_ctxs2[s]].remaining_flops)
+				{
+					max_nflops = hypervisor.sched_ctx_w[sched_ctxs2[s]].remaining_flops;
+					max_nflops_sched_ctx = sched_ctxs2[s];
+				}
+			}
+
+			int max_possible_workers = starpu_worker_get_count();
+			/*if the sum of the max cpus is smaller than the total cpus available 
+			  increase the max for the ones having more ready tasks to exec */
+			if(current_total_max_nworkers < max_possible_workers)
 			{
-				max_nready = nready_tasks;
-				max_nready_sched_ctx = sched_ctxs[i];
+				int diff = max_possible_workers - current_total_max_nworkers;
+				struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(max_nflops_sched_ctx);
+				config->max_nworkers += diff;
+#ifdef STARPU_SC_HYPERVISOR_DEBUG
+				printf("%d: redib max_nworkers incr %d \n",  max_nflops_sched_ctx, config->max_nworkers);
+#endif
+				_update_max_diff_hierarchically(max_nflops_sched_ctx, diff);
 			}
 		}
-		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(max_nready_sched_ctx);
-		config->max_nworkers += diff;
-		printf("%d: redib max_nworkers incr %d \n",  max_nready_sched_ctx, config->max_nworkers);
 	}
+
+	
+
+	/*if the sum of the max cpus is smaller than the total cpus available 
+	  increase the max for the ones having more ready tasks to exec */
+	/* if(configured && total_max_nworkers < max_workers) */
+/* 	{ */
+/* 		int diff = max_workers - total_max_nworkers; */
+/* 		int max_nready = -1; */
+/* 		unsigned max_nready_sched_ctx = sched_ctxs[0]; */
+/* 		for(i = 0; i < nsched_ctxs; i++) */
+/* 		{ */
+/* 			int nready_tasks = starpu_sched_ctx_get_nready_tasks(sched_ctxs[i]); */
+/* 			if(max_nready < nready_tasks) */
+/* 			{ */
+/* 				max_nready = nready_tasks; */
+/* 				max_nready_sched_ctx = sched_ctxs[i]; */
+/* 			} */
+/* 		} */
+/* 		struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(max_nready_sched_ctx); */
+/* 		config->max_nworkers += diff; */
+/* 		printf("%d: redib max_nworkers incr %d \n",  max_nready_sched_ctx, config->max_nworkers); */
+/* 	} */
+       
 }
 
 /* notifies the hypervisor that a new task was pushed on the queue of the worker */
@@ -961,10 +1145,9 @@ static void notify_pushed_task(unsigned sched_ctx, int worker)
 		hypervisor.sched_ctx_w[sched_ctx].start_time = starpu_timing_now();
 
 	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+	{
 		hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] = starpu_timing_now();
-
-	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].real_start_time == 0.0)
-		hypervisor.sched_ctx_w[sched_ctx].real_start_time = starpu_timing_now();
+	}
 
 	int ntasks = get_ntasks(hypervisor.sched_ctx_w[sched_ctx].pushed_tasks);
 
@@ -1018,9 +1201,16 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 
 	if(hypervisor.resize[sched_ctx] && hypervisor.policy.handle_idle_cycle)
 	{
-		if(sc_w->sched_ctx != STARPU_NMAX_SCHED_CTXS && sc_w->hyp_react_start_time != 0.0)
+		if(sc_w->hyp_react_start_time == 0.0)
+			sc_w->hyp_react_start_time = starpu_timing_now();
+		
+		double curr_time = starpu_timing_now();
+		double elapsed_time = (curr_time - sc_w->hyp_react_start_time) / 1000000.0; /* in seconds */
+		if(sc_w->sched_ctx != STARPU_NMAX_SCHED_CTXS && elapsed_time > sc_w->config->time_sample)
 		{
 			unsigned idle_everywhere = 0;
+			unsigned *sched_ctxs = NULL;
+			unsigned nsched_ctxs = 0;
 			int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
 			if(ret != EBUSY)
 			{
@@ -1028,8 +1218,7 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 				{
 					idle_everywhere = 1;
 				
-					unsigned *sched_ctxs = NULL;
-					unsigned nsched_ctxs = starpu_worker_get_sched_ctx_list(worker, &sched_ctxs);
+					nsched_ctxs = starpu_worker_get_sched_ctx_list(worker, &sched_ctxs);
 					int s;
 					for(s = 0; s < nsched_ctxs; s++)
 					{
@@ -1045,21 +1234,43 @@ static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time)
 			}
 			
 			if(idle_everywhere)
+			{
+				double hyp_overhead_start = starpu_timing_now();
 				hypervisor.policy.handle_idle_cycle(sched_ctx, worker);
+				double hyp_overhead_end = starpu_timing_now();
+				hyp_overhead += (hyp_overhead_end - hyp_overhead_start);
+			}
+
+
+			sc_w->hyp_react_start_time = starpu_timing_now();
 		}
 	}
 	return;
 }
 
+void _update_real_start_time_hierarchically(unsigned sched_ctx)
+{
+	hypervisor.sched_ctx_w[sched_ctx].real_start_time = starpu_timing_now();
+	if(starpu_sched_ctx_get_hierarchy_level(sched_ctx) > 0)
+	{
+		_update_real_start_time_hierarchically(starpu_sched_ctx_get_inheritor(sched_ctx));
+	}
+	return;
+}
 
 /* notifies the hypervisor that the worker is no longer idle and a new task was pushed on its queue */
 static void notify_poped_task(unsigned sched_ctx, int worker)
 {
+	if(hypervisor.sched_ctx_w[sched_ctx].total_flops != 0.0 && hypervisor.sched_ctx_w[sched_ctx].real_start_time == 0.0)
+		_update_real_start_time_hierarchically(sched_ctx);
+
 	if(hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] == 0.0)
+	{
 		hypervisor.sched_ctx_w[sched_ctx].start_time_w[worker] = starpu_timing_now();
+	}
 
 	hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = starpu_timing_now();
-
+		
 	if(hypervisor.sched_ctx_w[sched_ctx].idle_start_time[worker] > 0.0)
 	{
 		int ns = hypervisor.nsched_ctxs;
@@ -1089,7 +1300,6 @@ static void notify_poped_task(unsigned sched_ctx, int worker)
 				
 	if(hypervisor.policy.handle_idle_end)
 		hypervisor.policy.handle_idle_end(sched_ctx, worker);
-
 }
 
  
@@ -1120,8 +1330,9 @@ static void notify_post_exec_task(struct starpu_task *task, size_t data_size, ui
 	if(hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] != 0.0)
 	{
 		double current_time = starpu_timing_now();
-		hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] += (current_time - 
-									hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+		double exec_time = (current_time - 
+				    hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker]) / 1000000.0; /* in seconds */ 
+		hypervisor.sched_ctx_w[sched_ctx].exec_time[worker] += exec_time;
 		hypervisor.sched_ctx_w[sched_ctx].exec_start_time[worker] = 0.0;
 	}
 
@@ -1153,7 +1364,10 @@ static void notify_post_exec_task(struct starpu_task *task, size_t data_size, ui
 			double elapsed_time = (curr_time - hypervisor.sched_ctx_w[sched_ctx].hyp_react_start_time) / 1000000.0; /* in seconds */
 			if(hypervisor.sched_ctx_w[sched_ctx].sched_ctx != STARPU_NMAX_SCHED_CTXS && elapsed_time > hypervisor.sched_ctx_w[sched_ctx].config->time_sample)
 			{
+				double hyp_overhead_start = starpu_timing_now();
 				hypervisor.policy.handle_poped_task(sched_ctx, worker, task, footprint);
+				double hyp_overhead_end = starpu_timing_now();
+				hyp_overhead += (hyp_overhead_end - hyp_overhead_start);
 				hypervisor.sched_ctx_w[sched_ctx].hyp_react_start_time = starpu_timing_now();
 			}
 		}
@@ -1349,17 +1563,22 @@ struct types_of_workers* sc_hypervisor_get_types_of_workers(int *workers, unsign
 
 void sc_hypervisor_update_diff_total_flops(unsigned sched_ctx, double diff_total_flops)
 {
+//	double hyp_overhead_start = starpu_timing_now();
 	starpu_pthread_mutex_lock(&hypervisor.sched_ctx_w[sched_ctx].mutex);
 	hypervisor.sched_ctx_w[sched_ctx].total_flops += diff_total_flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops += diff_total_flops;	
 	starpu_pthread_mutex_unlock(&hypervisor.sched_ctx_w[sched_ctx].mutex);
+/* 	double hyp_overhead_end = starpu_timing_now(); */
+/* 	hyp_overhead += (hyp_overhead_end - hyp_overhead_start); */
 	if(starpu_sched_ctx_get_hierarchy_level(sched_ctx) > 0)
 		sc_hypervisor_update_diff_total_flops(starpu_sched_ctx_get_inheritor(sched_ctx), diff_total_flops);
+	return;
 
 }
 
 void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_elapsed_flops)
 {
+//	double hyp_overhead_start = starpu_timing_now();
 	int workerid = starpu_worker_get_id();
 	if(workerid != -1)
 	{
@@ -1368,8 +1587,11 @@ void sc_hypervisor_update_diff_elapsed_flops(unsigned sched_ctx, double diff_ela
 		hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[workerid] += diff_elapsed_flops;
 //		starpu_pthread_mutex_unlock(&hypervisor.sched_ctx_w[sched_ctx].mutex);
 	}
+/* 	double hyp_overhead_end = starpu_timing_now(); */
+/* 	hyp_overhead += (hyp_overhead_end - hyp_overhead_start); */
 	if(starpu_sched_ctx_get_hierarchy_level(sched_ctx) > 0)
 		sc_hypervisor_update_diff_elapsed_flops(starpu_sched_ctx_get_inheritor(sched_ctx), diff_elapsed_flops);
+	return;
 }
 
 void sc_hypervisor_get_ctxs_on_level(unsigned **sched_ctxs, int *nsched_ctxs, unsigned hierarchy_level, unsigned father_sched_ctx_id)
@@ -1407,3 +1629,24 @@ unsigned sc_hypervisor_get_nhierarchy_levels(void)
 	}
 	return nlevels;
 }
+
+void sc_hypervisor_get_leaves(unsigned *sched_ctxs, int nsched_ctxs, unsigned *leaves, int *nleaves)
+{
+	int s, s2;
+	for(s = 0; s < nsched_ctxs; s++)
+	{
+		unsigned is_someones_father = 0;
+		for(s2 = 0; s2 < nsched_ctxs; s2++)
+		{
+			unsigned father = starpu_sched_ctx_get_inheritor(sched_ctxs[s2]);
+			if(sched_ctxs[s] == father)
+			{
+				is_someones_father = 1;
+				break;
+			}
+		}
+		if(!is_someones_father)
+			leaves[(*nleaves)++] = sched_ctxs[s];
+	}
+	return;
+}
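
The hypervisor now brackets its policy callbacks with starpu_timing_now() and accumulates the difference in hyp_overhead; sc_hypervisor_print_overhead() appends that total (in microseconds) to the file named by the OVERHEAD_FILE environment variable, or to "overhead_microsec" by default. A hedged usage sketch from application code; the output path is a placeholder:

#include <stdlib.h>
#include <sc_hypervisor.h>

void dump_hypervisor_overhead(void)
{
	/* optional: pick the output file before dumping; the hypervisor
	   only appends one line per call */
	setenv("OVERHEAD_FILE", "/tmp/hyp_overhead.log", 1);
	sc_hypervisor_print_overhead();
}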

+ 48 - 3
src/core/sched_ctx.c

@@ -606,6 +606,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 	{
 		starpu_sched_ctx_add_workers(workerids, nworkers_ctx, inheritor_sched_ctx_id);
 		starpu_sched_ctx_set_priority(workerids, nworkers_ctx, inheritor_sched_ctx_id, 1);
+		starpu_sched_ctx_set_priority_on_level(workerids, nworkers_ctx, inheritor_sched_ctx_id, 1);
 	}
 
 	if(!_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx_id))
@@ -700,12 +701,38 @@ void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx
 
 }
 
+void starpu_sched_ctx_set_priority_on_level(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx, unsigned priority)
+{
+/* 	int w; */
+/* 	struct _starpu_worker *worker = NULL; */
+/* 	for(w = 0; w < nworkers_to_add; w++) */
+/* 	{ */
+/* 		worker = _starpu_get_worker_struct(workers_to_add[w]); */
+/* 		STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex); */
+/* 		struct _starpu_sched_ctx_list *l = NULL; */
+/* 		for (l = worker->sched_ctx_list; l; l = l->next) */
+/* 		{ */
+/* 			if(l->sched_ctx != STARPU_NMAX_SCHED_CTXS && l->sched_ctx != sched_ctx && */
+/* 			   starpu_sched_ctx_get_hierarchy_level(l->sched_ctx) == starpu_sched_ctx_get_hierarchy_level(sched_ctx)) */
+/* 			{ */
+/* 				/\* the lock is taken inside the func *\/ */
+/* 				STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex); */
+/* 				starpu_sched_ctx_set_priority(&workers_to_add[w], 1, l->sched_ctx, priority); */
+/* 				STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex); */
+/* 			} */
+/* 		} */
+/* 		STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex); */
+/* 	} */
+/* 	return; */
+
+}
 static void _set_priority_hierarchically(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx, unsigned priority)
 {
 	if(starpu_sched_ctx_get_hierarchy_level(sched_ctx) > 0)
 	{
 		unsigned father = starpu_sched_ctx_get_inheritor(sched_ctx);
 		starpu_sched_ctx_set_priority(workers_to_add, nworkers_to_add, father, priority);
+		starpu_sched_ctx_set_priority_on_level(workers_to_add, nworkers_to_add, father, priority);
 		_set_priority_hierarchically(workers_to_add, nworkers_to_add, father, priority);
 	}
 	return;
@@ -733,6 +760,7 @@ void starpu_sched_ctx_add_workers(int *workers_to_add, int nworkers_to_add, unsi
 		{
 			_starpu_update_workers_with_ctx(added_workers, n_added_workers, sched_ctx->id);
 		}
+		starpu_sched_ctx_set_priority(workers_to_add, nworkers_to_add, sched_ctx_id, 1);
 		_set_priority_hierarchically(workers_to_add, nworkers_to_add, sched_ctx_id, 0);
 
 	}
@@ -769,7 +797,9 @@ void starpu_sched_ctx_remove_workers(int *workers_to_remove, int nworkers_to_rem
 		_starpu_remove_workers_from_sched_ctx(sched_ctx, workers_to_remove, nworkers_to_remove, removed_workers, &n_removed_workers);
 
 		if(n_removed_workers > 0)
-			_starpu_update_workers_without_ctx(removed_workers, n_removed_workers, sched_ctx->id, 0);
+		{
+			_starpu_update_workers_without_ctx(removed_workers, n_removed_workers, sched_ctx_id, 0);
+		}
 
 	}
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&changing_ctx_mutex[sched_ctx_id]);
@@ -846,6 +876,7 @@ void _starpu_decrement_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	int reached = _starpu_barrier_counter_get_reached_start(&sched_ctx->tasks_barrier);
 	int finished = reached == 1;
+
         /* when finished decrementing the tasks if the user signaled he will not submit tasks anymore
            we can move all its workers to the inheritor context */
 	if(finished && sched_ctx->inheritor != STARPU_NMAX_SCHED_CTXS)
@@ -1394,6 +1425,7 @@ void starpu_sched_ctx_set_priority(int *workers, int nworkers, unsigned sched_ct
 		for(w = 0; w < nworkers; w++)
 		{
 			worker = _starpu_get_worker_struct(workers[w]);
+			STARPU_PTHREAD_MUTEX_LOCK(&worker->sched_mutex);
 			struct _starpu_sched_ctx_list *l = NULL;
 			for (l = worker->sched_ctx_list; l; l = l->next)
 			{
@@ -1403,11 +1435,26 @@ void starpu_sched_ctx_set_priority(int *workers, int nworkers, unsigned sched_ct
 					break;
 				}
 			}
+			STARPU_PTHREAD_MUTEX_UNLOCK(&worker->sched_mutex);
 		}
 	}
 	return;
 }
 
+unsigned starpu_sched_ctx_get_priority(int workerid, unsigned sched_ctx_id)
+{
+	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
+	struct _starpu_sched_ctx_list *l = NULL;
+	for (l = worker->sched_ctx_list; l; l = l->next)
+	{
+		if(l->sched_ctx == sched_ctx_id)
+		{
+			return l->priority;
+		}
+	}
+	return 1;
+}
+
 unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker)
 {
 	struct _starpu_sched_ctx_list *l = NULL;
@@ -1597,5 +1644,3 @@ void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, uns
 
 	return ret;
 }
-
-

+ 32 - 2
src/core/sched_policy.c

@@ -25,6 +25,8 @@
 #include <core/debug.h>
 
 static int use_prefetch = 0;
+double idle[STARPU_NMAXWORKERS];
+double idle_start[STARPU_NMAXWORKERS];
 
 int starpu_get_prefetch_flag(void)
 {
@@ -627,7 +629,8 @@ struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker
 		}
 	}
 
-	if(worker->pop_ctx_priority == 0 && first_sched_ctx == STARPU_NMAX_SCHED_CTXS)
+//	if(worker->pop_ctx_priority == 0 && first_sched_ctx == STARPU_NMAX_SCHED_CTXS)
+	if(first_sched_ctx == STARPU_NMAX_SCHED_CTXS)
 		first_sched_ctx = worker->sched_ctx_list->sched_ctx;
 
 	worker->poped_in_ctx[first_sched_ctx] = !worker->poped_in_ctx[first_sched_ctx];
@@ -729,9 +732,18 @@ pick:
 
 
 	if (!task)
+	{
+		idle_start[worker->workerid] = starpu_timing_now();
 		return NULL;
+	}
 
-
+	if(idle_start[worker->workerid] != 0.0)
+	{
+		double idle_end = starpu_timing_now();
+		idle[worker->workerid] += (idle_end - idle_start[worker->workerid]);
+		idle_start[worker->workerid] = 0.0;
+	}
+	
 
 #ifdef STARPU_USE_SC_HYPERVISOR
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
@@ -866,3 +878,21 @@ int starpu_push_local_task(int workerid, struct starpu_task *task, int prio)
 
 	return  _starpu_push_local_task(worker, task, prio);
 }
+
+void _starpu_print_idle_time()
+{
+	double all_idle = 0.0;
+	int i = 0;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		all_idle += idle[i];
+
+	FILE *f;
+	const char *sched_env = getenv("IDLE_FILE");
+	if(!sched_env)
+		f = fopen("idle_microsec", "a");
+	else
+		f = fopen(sched_env, "a");
+	fprintf(f, "%lf \n", all_idle);
+	fclose(f);
+	
+}

+ 1 - 0
src/core/sched_policy.h

@@ -54,6 +54,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 
 void _starpu_sched_pre_exec_hook(struct starpu_task *task);
 
+void _starpu_print_idle_time();
 /*
  *	Predefined policies
  */

+ 1 - 1
src/core/workers.c

@@ -1342,7 +1342,7 @@ void starpu_shutdown(void)
 	if (_starpu_scc_common_is_mp_initialized())
 		_starpu_scc_src_mp_deinit();
 #endif
-
+//	_starpu_print_idle_time();
 	_STARPU_DEBUG("Shutdown finished\n");
 }
 

+ 1 - 1
src/worker_collection/worker_tree.c

@@ -64,7 +64,7 @@ static int tree_get_next(struct starpu_worker_collection *workers, struct starpu
 
 	ret = _starpu_worker_get_workerid(neighbour->id);
 	STARPU_ASSERT_MSG(ret != -1, "bind id not correct");
-	it->visited[ret] = 1;
+	it->visited[neighbour->id] = 1;
 
 	return ret;
 }