
fixes to ispeed

Andra Hugo 12 years ago
parent
commit
6acf4198af

+ 33 - 0
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -97,22 +97,53 @@ struct sched_ctx_hypervisor_resize_ack
 	int *acked_workers;
 };
 
+/* wrapper attached to a sched_ctx storing monitoring information */
 struct sched_ctx_hypervisor_wrapper
 {
+	/* the sched_ctx it monitors */
 	unsigned sched_ctx;
+
+	/* user configuration meant to limit resizing */
 	struct sched_ctx_hypervisor_policy_config *config;
+
+	/* idle time of workers in this context */
 	double current_idle_time[STARPU_NMAXWORKERS];
+	
+	/* list of workers that will leave this context (lazy resizing process) */
 	int worker_to_be_removed[STARPU_NMAXWORKERS];
+
+	/* number of tasks pushed on each worker in this ctx */
 	int pushed_tasks[STARPU_NMAXWORKERS];
+
+	/* number of tasks popped from each worker in this ctx */
 	int poped_tasks[STARPU_NMAXWORKERS];
+
+	/* number of flops the context has to execute */
 	double total_flops;
+
+	/* number of flops executed from the beginning until now */
 	double total_elapsed_flops[STARPU_NMAXWORKERS];
+
+	/* number of flops executed since last resizing */
 	double elapsed_flops[STARPU_NMAXWORKERS];
+
+	/* the average speed of workers when they belonged to this context */
 	double ref_velocity[STARPU_NMAXWORKERS];
+
+	/* number of flops submitted to this ctx */
 	double submitted_flops;
+
+	/* number of flops that still have to be executed in this ctx */
 	double remaining_flops;
+	
+	/* the start time of the resizing sample of this context */
 	double start_time;
+
+	/* the workers don't leave the current ctx until the receiver ctx
+	   has acked the reception of these workers */
 	struct sched_ctx_hypervisor_resize_ack resize_ack;
+
+	/* mutex to protect the ack of workers */
 	pthread_mutex_t mutex;
 };
 
@@ -133,6 +164,8 @@ struct sched_ctx_hypervisor_policy
 	void (*handle_post_exec_hook)(unsigned sched_ctx, int task_tag);
 
 	void (*handle_submitted_job)(struct starpu_task *task, unsigned footprint);
+	
+	void (*end_ctx)(unsigned sched_ctx);
 };
 
 struct starpu_performance_counters *sched_ctx_hypervisor_init(struct sched_ctx_hypervisor_policy *policy);
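
Taken together, the new wrapper comments describe the raw monitoring data the hypervisor accumulates per context. As a rough illustration (not part of the patch; the helper name is made up), a policy can turn these fields into an observed speed much like _get_ctx_velocity() in policy_tools.c does:

	/* hypothetical helper: estimate the speed observed in a context since the
	   last resize sample, from the wrapper fields documented above */
	static double example_ctx_speed(struct sched_ctx_hypervisor_wrapper *sc_w)
	{
		double elapsed_flops = 0.0;
		int i;
		for(i = 0; i < STARPU_NMAXWORKERS; i++)
			elapsed_flops += sc_w->elapsed_flops[i]; /* flops done since the last resize */

		/* start_time is reset whenever the context gets resized */
		double elapsed_time = (starpu_timing_now() - sc_w->start_time) / 1000000.0; /* in seconds */
		if(elapsed_time <= 0.0 || elapsed_flops <= 0.0)
			return 0.0;

		return (elapsed_flops / 1000000000.0) / elapsed_time; /* in GFlop/s */
	}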

+ 1 - 0
sched_ctx_hypervisor/src/hypervisor_policies/app_driven_policy.c

@@ -32,6 +32,7 @@ struct sched_ctx_hypervisor_policy app_driven_policy =
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = app_driven_handle_post_exec_hook,
 	.handle_submitted_job = NULL,
+	.end_ctx = NULL,
 	.custom = 0,
 	.name = "app_driven"
 };

+ 1 - 0
sched_ctx_hypervisor/src/hypervisor_policies/gflops_rate_policy.c

@@ -302,6 +302,7 @@ struct sched_ctx_hypervisor_policy gflops_rate_policy = {
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,
+	.end_ctx = NULL,
 	.custom = 0,
 	.name = "gflops_rate"
 };

+ 1 - 0
sched_ctx_hypervisor/src/hypervisor_policies/idle_policy.c

@@ -50,6 +50,7 @@ struct sched_ctx_hypervisor_policy idle_policy =
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,
+	.end_ctx = NULL,
 	.custom = 0,
 	.name = "idle"
 };

+ 13 - 2
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -56,7 +56,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
 			}
 			
-//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
+			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
 		}
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
@@ -348,7 +348,8 @@ static double _find_tmax(double t1, double t2)
 
 static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker)
 {
-
+	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	_get_velocity_per_worker(sc_w, worker);
 	int ret = pthread_mutex_trylock(&act_hypervisor_mutex);
 	if(ret != EBUSY)
 	{
@@ -408,6 +409,15 @@ static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker)
 	}
 }
 
+static void ispeed_lp_end_ctx(unsigned sched_ctx)
+{
+	struct sched_ctx_hypervisor_wrapper* sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
+	int worker;
+	for(worker = 0; worker < 12; worker++)
+		printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]);
+
+	return;
+}
 
 struct sched_ctx_hypervisor_policy ispeed_lp_policy = {
 	.size_ctxs = NULL,
@@ -417,6 +427,7 @@ struct sched_ctx_hypervisor_policy ispeed_lp_policy = {
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,
+	.end_ctx = ispeed_lp_end_ctx,
 	.custom = 0,
 	.name = "ispeed_lp"
 };
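
ispeed_lp_end_ctx() above walks a fixed range of 12 worker ids; a variant that walks the context's own worker collection instead (the same iterator pattern used in policy_tools.c and sched_ctx.c further down) might look like this sketch, with a made-up function name:

	static void example_end_ctx(unsigned sched_ctx)
	{
		struct sched_ctx_hypervisor_wrapper *sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctx);
		struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sched_ctx);
		struct starpu_iterator it;
		int worker;

		if(workers->init_iterator)
			workers->init_iterator(workers, &it);

		/* print the reference speed of every worker still in this context */
		while(workers->has_next(workers, &it))
		{
			worker = workers->get_next(workers, &it);
			printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]);
		}
	}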

+ 1 - 0
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_policy.c

@@ -189,6 +189,7 @@ struct sched_ctx_hypervisor_policy ispeed_policy = {
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,
+	.end_ctx = NULL,
 	.custom = 0,
 	.name = "ispeed"
 };

+ 1 - 0
sched_ctx_hypervisor/src/hypervisor_policies/lp2_policy.c

@@ -588,6 +588,7 @@ struct sched_ctx_hypervisor_policy lp2_policy = {
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = lp2_handle_submitted_job,
+	.end_ctx = NULL,
 	.custom = 0,
 	.name = "lp2"
 };

+ 1 - 0
sched_ctx_hypervisor/src/hypervisor_policies/lp_policy.c

@@ -94,6 +94,7 @@ struct sched_ctx_hypervisor_policy lp_policy = {
 	.handle_idle_end = NULL,
 	.handle_post_exec_hook = NULL,
 	.handle_submitted_job = NULL,
+	.end_ctx = NULL,
 	.custom = 0,
 	.name = "lp"
 };

+ 104 - 19
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c

@@ -272,17 +272,26 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 	int s, s2, w;
 	for(s = 0; s < ns; s++)
 	{
-		int workers_move[STARPU_NMAXWORKERS];
-		int nw_move = 0;
+		int tmp_workers_move[nw][STARPU_NMAXWORKERS];
+		int tmp_nw_move[nw];
+
+		int tmp_workers_add[nw][STARPU_NMAXWORKERS];
+		int tmp_nw_add[nw];
 		
-		int workers_add[STARPU_NMAXWORKERS];
-		int nw_add = 0;
 
+		for(w = 0; w < nw; w++)		
+		{
+			tmp_nw_move[w] = 0;
+			tmp_nw_add[w] = 0;
+		}
+
+		/* find workers that ctx s has to give away */
 		for(w = 0; w < nw; w++)
 		{
-			enum starpu_archtype arch;
+			enum starpu_archtype arch = STARPU_ANY_WORKER;
 			if(w == 0) arch = STARPU_CUDA_WORKER;
 			if(w == 1) arch = STARPU_CPU_WORKER;
+			
 
 			if(w == 1)
 			{
@@ -293,7 +302,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 					int *workers_to_move = _get_first_workers(sched_ctxs[s], &nworkers_to_move, arch);
 					int i;
 					for(i = 0; i < nworkers_to_move; i++)
-						workers_move[nw_move++] = workers_to_move[i];
+						tmp_workers_move[w][tmp_nw_move[w]++] = workers_to_move[i];
 					free(workers_to_move);
 				}
 			}
@@ -313,7 +322,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 						{
 							int i;
 							for(i = 0; i < x; i++)
-								workers_move[nw_move++] = workers_to_move[i];
+								tmp_workers_move[w][tmp_nw_move[w]++] = workers_to_move[i];
 
 						}
 						free(workers_to_move);
@@ -326,13 +335,13 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 						{
 							int i;
 							for(i = 0; i < x-1; i++)
-								workers_move[nw_move++] = workers_to_move[i];
+								tmp_workers_move[w][tmp_nw_move[w]++] = workers_to_move[i];
 
 							if(diff > 0.8)
-								workers_move[nw_move++] = workers_to_move[x-1];
+								tmp_workers_move[w][tmp_nw_move[w]++] = workers_to_move[x-1];
 							else
 								if(diff > 0.3)
-									workers_add[nw_add++] = workers_to_move[x-1];
+									tmp_workers_add[w][tmp_nw_add[w]++] = workers_to_move[x-1];
 
 						}
 						free(workers_to_move);
@@ -345,29 +354,105 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		{
 			if(sched_ctxs[s2] != sched_ctxs[s])
 			{
-				double nworkers_ctx2 = sched_ctx_hypervisor_get_nworkers_ctx(sched_ctxs[s2], STARPU_ANY_WORKER) * 1.0;
-				int total_res = 0;
+				/* find workers that ctx s2 wants to accept from ctx s 
+				   the rest of them will probably be accepted by another ctx */
+				int workers_move[STARPU_NMAXWORKERS];
+				int nw_move = 0;
+				
+				int workers_add[STARPU_NMAXWORKERS];
+				int nw_add = 0;
+
+				int w;
 				for(w = 0; w < nw; w++)
-					total_res += res[s2][w];
-//				if(( total_res - nworkers_ctx2) >= 0.0 && nw_move > 0)
+				{
+					enum starpu_archtype arch = STARPU_ANY_WORKER;
+					if(w == 0) arch = STARPU_CUDA_WORKER;
+					if(w == 1) arch = STARPU_CPU_WORKER;
+
+					int nw_ctx2 = sched_ctx_hypervisor_get_nworkers_ctx(sched_ctxs[s2], arch);
+					int nw_needed = res_rounded[s2][w] - nw_ctx2;
+
+					if( nw_needed > 0 && tmp_nw_move[w] > 0)
+					{
+						nw_move += nw_needed >= tmp_nw_move[w] ? tmp_nw_move[w] : nw_needed;
+						int i = 0, j = 0;
+						for(i = 0; i < STARPU_NMAXWORKERS; i++)
+						{
+							if(tmp_workers_move[w][i] != -1)
+							{
+								workers_move[j++] = tmp_workers_move[w][i];
+								tmp_workers_move[w][i] = -1;
+								if(j == nw_move)
+									break;
+							}
+						}
+						tmp_nw_move[w] -=  nw_move;
+					}
+
+					double needed = res[s2][w] - nw_ctx2 * 1.0;
+					double x_double = (double)nw_needed;
+					double diff = needed - x_double;
+					if(diff > 0.3 && tmp_nw_add[w] != 0)
+					{
+						nw_add = tmp_nw_add[w];
+						int i = 0, j = 0;
+						for(i = 0; i < STARPU_NMAXWORKERS; i++)
+						{
+							if(tmp_workers_add[w][i] != -1)
+							{
+								workers_add[j++] = tmp_workers_add[w][i];
+								tmp_workers_add[w][i] = -1;
+								if(j == nw_add)
+									break;
+							}
+						}
+						tmp_nw_add[w] -=  nw_add;
+					}
+				}
+
+				
 				if(nw_move > 0)
 				{
-					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
+					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 1);
 					nw_move = 0;
-//					break;
 				}
-//				if((total_res - nworkers_ctx2) >= 0.0 &&  (total_res - nworkers_ctx2) <= (double)nw_add && nw_add > 0)
+
 				if(nw_add > 0)
 				{
 					sched_ctx_hypervisor_add_workers_to_sched_ctx(workers_add, nw_add, sched_ctxs[s2]);
 					nw_add = 0;
-//					break;
 				}
+			}
+		}
+
+		/* if there are workers that weren't accepted by anyone, but ctx s wants
+		   to get rid of them, just remove them from ctx s */
+		int workers_move[STARPU_NMAXWORKERS];
+		int nw_move = 0;
 				
+		int w;
+		for(w = 0; w < nw; w++)
+		{
+			if(tmp_nw_move[w] > 0)
+			{
+				nw_move += tmp_nw_move[w];
+				int i = 0, j = 0;
+				for(i = 0; i < STARPU_NMAXWORKERS; i++)
+				{
+					if(tmp_workers_move[w][i] != -1)
+					{
+						workers_move[j++] = tmp_workers_move[w][i];
+						tmp_workers_move[w][i] = -1;
+						if(j == nw_move)
+							break;
+					}
+				}
+
 			}
 		}
+
 		if(nw_move > 0)
-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
+			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 1);
 	}
 }
 

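In outline, the reworked redistribution in _lp_redistribute_resources_in_ctxs() now proceeds per worker type, in three phases (a summary sketch of the loop above, not code from the patch):

	/*
	 * for each context s:
	 *   1. for each worker type w (w == 0: CUDA, w == 1: CPU), collect in
	 *      tmp_workers_move[w] the workers ctx s has to give away, and in
	 *      tmp_workers_add[w] the ones it could additionally share;
	 *   2. for each other context s2, hand over only as many workers from
	 *      tmp_workers_move[w] as s2 still needs (res_rounded[s2][w] minus its
	 *      current worker count) via sched_ctx_hypervisor_move_workers(..., now = 1),
	 *      and possibly share the tmp_workers_add[w] ones via
	 *      sched_ctx_hypervisor_add_workers_to_sched_ctx();
	 *   3. whatever remains in tmp_workers_move[] was accepted by no context,
	 *      so it is simply removed from ctx s with
	 *      sched_ctx_hypervisor_remove_workers_from_sched_ctx(), whose last
	 *      argument is likewise switched from 0 to 1.
	 */
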
+ 26 - 11
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c

@@ -298,7 +298,7 @@ unsigned _resize_to_unknown_receiver(unsigned sender_sched_ctx, unsigned now)
 	return _resize(sender_sched_ctx, STARPU_NMAX_SCHED_CTXS, 0, now);
 }
 
-static double _get_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
+static double _get_best_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w, int *npus, enum starpu_archtype req_arch)
 {
 	double ret_val = 0.0;
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
@@ -314,7 +314,8 @@ static double _get_elapsed_flops(struct sched_ctx_hypervisor_wrapper* sc_w, int
                 enum starpu_archtype arch = starpu_worker_get_type(worker);
                 if(arch == req_arch)
                 {
-			ret_val += sc_w->elapsed_flops[worker];
+			if(sc_w->elapsed_flops[worker] > ret_val)
+				ret_val = sc_w->elapsed_flops[worker];
 			(*npus)++;
                 }
         }
@@ -327,6 +328,8 @@ static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_
 	struct starpu_sched_ctx_worker_collection *workers = starpu_sched_ctx_get_worker_collection(sc_w->sched_ctx);
         int worker;
 
+	double avg = 0.0;
+	int n = 0;
 	struct starpu_iterator it;
 	if(workers->init_iterator)
                 workers->init_iterator(workers, &it);
@@ -338,11 +341,12 @@ static double _get_ispeed_sample_for_type_of_worker(struct sched_ctx_hypervisor_
                 if(arch == req_arch)
                 {
 			struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-			return config->ispeed_w_sample[worker];
+			avg += config->ispeed_w_sample[worker];
+			n++;
 		}
         }
 
-	return 0.0;
+	return n != 0 ? avg/n : 0;
 }
 
 static double _get_ispeed_sample_for_sched_ctx(unsigned sched_ctx)
@@ -377,6 +381,12 @@ double _get_ctx_velocity(struct sched_ctx_hypervisor_wrapper* sc_w)
 /* 	double redim_sample = config->ispeed_ctx_sample != 0.0 ? config->ispeed_ctx_sample :  */
 /* 		(elapsed_flops == total_elapsed_flops ? HYPERVISOR_START_REDIM_SAMPLE : HYPERVISOR_REDIM_SAMPLE); */
 //	printf("%d: prc %lf sample %lf\n", sc_w->sched_ctx, prc, redim_sample);
+
+/* 	double curr_time2 = starpu_timing_now(); */
+/* 	double elapsed_time2 = (curr_time2 - sc_w->start_time) / 1000000.0; /\* in seconds *\/ */
+/* 	if(elapsed_time2 > 5.0 && elapsed_flops < sample) */
+/* 		return (elapsed_flops/1000000000.0)/elapsed_time2;/\* in Gflops/s *\/ */
+
 	if(elapsed_flops >= sample)
         {
                 double curr_time = starpu_timing_now();
@@ -469,8 +479,9 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
         {
                 double curr_time = starpu_timing_now();
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
-                sc_w->ref_velocity[worker] = (elapsed_flops/elapsed_time);/* in Gflops/s */
-                return sc_w->ref_velocity[worker];
+                double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
+		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
+                return vel;
         }
 
         return 0.00000000000001;
@@ -482,15 +493,18 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 double _get_velocity_per_worker_type(struct sched_ctx_hypervisor_wrapper* sc_w, enum starpu_archtype arch)
 {
         int npus = 0;
-        double elapsed_flops = _get_elapsed_flops(sc_w, &npus, arch) / 1000000000.0 ; /* in gflops */
-	double avg_elapsed_flops = elapsed_flops / npus;
+        double elapsed_flops = _get_best_elapsed_flops(sc_w, &npus, arch) / 1000000000.0 ; /* in gflops */
+	if(npus == 0)
+		return -1.0; 
+
 	double sample = _get_ispeed_sample_for_type_of_worker(sc_w, arch) / 1000000000.0;
 
-        if( avg_elapsed_flops >= sample)
+        if( elapsed_flops != 0.0)
         {
                 double curr_time = starpu_timing_now();
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
-                return avg_elapsed_flops/elapsed_time; /* in Gflops/s */
+		double velocity = elapsed_flops/elapsed_time; /* in Gflops/s */
+                return velocity;
         }
 
         return -1.0;
@@ -525,7 +539,8 @@ int _velocity_gap_btw_ctxs()
 					if(other_ctx_v != -1.0)
 					{
 						double gap = ctx_v < other_ctx_v ? other_ctx_v / ctx_v : ctx_v / other_ctx_v ;
-						if(gap > 1.5)
+//						if(gap > 1.5)
+						if(gap > 3.0)
 							return 1;
 					}
 				}
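
The per-worker reference velocity is now kept as a running average instead of being overwritten on every sample; a minimal standalone illustration of the update rule used in _get_velocity_per_worker() (the function name below is made up):

	/* blend the previous reference velocity with the freshly measured one;
	   start from the measurement when no previous estimate exists */
	static double example_update_ref_velocity(double old_ref, double measured)
	{
		return old_ref > 0.0 ? (old_ref + measured) / 2.0 : measured;
	}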

+ 66 - 44
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -64,6 +64,7 @@ static void _load_hypervisor_policy(struct sched_ctx_hypervisor_policy *policy)
 	hypervisor.policy.handle_idle_end = policy->handle_idle_end;
 	hypervisor.policy.handle_post_exec_hook = policy->handle_post_exec_hook;
 	hypervisor.policy.handle_submitted_job = policy->handle_submitted_job;
+	hypervisor.policy.end_ctx = policy->end_ctx;
 }
 
 
@@ -129,7 +130,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 	hypervisor.min_tasks = 0;
 	hypervisor.nsched_ctxs = 0;
 	pthread_mutex_init(&act_hypervisor_mutex, NULL);
-
+	hypervisor.start_executing_time = starpu_timing_now();
 	int i;
 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
 	{
@@ -149,6 +150,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 		hypervisor.sched_ctx_w[i].resize_ack.nmoved_workers = 0;
 		hypervisor.sched_ctx_w[i].resize_ack.acked_workers = NULL;
 		pthread_mutex_init(&hypervisor.sched_ctx_w[i].mutex, NULL);
+
 		int j;
 		for(j = 0; j < STARPU_NMAXWORKERS; j++)
 		{
@@ -277,6 +279,8 @@ static void _rearange_sched_ctxs(int *sched_ctxs, int old_nsched_ctxs)
 /* unregistered contexts will no longer be resized */
 void sched_ctx_hypervisor_unregister_ctx(unsigned sched_ctx)
 {
+	if(hypervisor.policy.end_ctx)
+		hypervisor.policy.end_ctx(sched_ctx);
 	pthread_mutex_lock(&act_hypervisor_mutex);
 	unsigned i;
 	for(i = 0; i < hypervisor.nsched_ctxs; i++)
@@ -303,6 +307,13 @@ void sched_ctx_hypervisor_unregister_ctx(unsigned sched_ctx)
 	pthread_mutex_unlock(&act_hypervisor_mutex);
 }
 
+static void _print_current_time()
+{
+	double curr_time = starpu_timing_now();
+	double elapsed_time = (curr_time - hypervisor.start_executing_time) / 1000000.0; /* in seconds */
+	printf("Time: %lf \n", elapsed_time);
+}
+
 static int get_ntasks( int *tasks)
 {
 	int ntasks = 0;
@@ -349,25 +360,68 @@ int sched_ctx_hypervisor_get_nworkers_ctx(unsigned sched_ctx, enum starpu_archty
 	return nworkers_ctx;
 }
 
+static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
+{
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
+}
+
+double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
+{
+	double ret_val = 0.0;
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		ret_val += sc_w->elapsed_flops[i];
+	return ret_val;
+}
+
+double sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
+{
+	double ret_val = 0.0;
+	int i;
+	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+		ret_val += sc_w->total_elapsed_flops[i];
+	return ret_val;
+}
+
+
+void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sched_ctx)
+{
+	/* info concerning only the gflops_rate strategy */
+	struct sched_ctx_hypervisor_wrapper *sender_sc_w = &hypervisor.sched_ctx_w[sender_sched_ctx];
+	struct sched_ctx_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
+	
+	double start_time =  starpu_timing_now();
+	sender_sc_w->start_time = start_time;
+	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
+	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
+	
+	receiver_sc_w->start_time = start_time;
+	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
+	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
+}
+
 /* actually move the workers: the cpus are moved, gpus are only shared  */
 /* forbids another resize request before this one is taken into account */
 void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned receiver_sched_ctx, int* workers_to_move, unsigned nworkers_to_move, unsigned now)
 {
 	if(nworkers_to_move > 0 && hypervisor.resize[sender_sched_ctx])// && hypervisor.resize[receiver_sched_ctx])
 	{
+		_print_current_time();
 		int j;
 		printf("resize ctx %d with", sender_sched_ctx);
 		for(j = 0; j < nworkers_to_move; j++)
 			printf(" %d", workers_to_move[j]);
 		printf("\n");
 
-		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int));
-		int ncpus;
+/* 		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int)); */
+/* 		int ncpus; */
 
-		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus);
+/* 		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus); */
 
-//		if(ncpus != 0)
-//			starpu_sched_ctx_remove_workers(cpus, ncpus, sender_sched_ctx);
+/* //		if(ncpus != 0) */
+/* //			starpu_sched_ctx_remove_workers(cpus, ncpus, sender_sched_ctx); */
 
 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
 
@@ -380,6 +434,8 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 			printf("\n");
 
 			starpu_sched_ctx_remove_workers(workers_to_move, nworkers_to_move, sender_sched_ctx);
+			
+			_reset_resize_sample_info(sender_sched_ctx, receiver_sched_ctx);
 		}
 		else
 		{
@@ -419,6 +475,7 @@ void sched_ctx_hypervisor_add_workers_to_sched_ctx(int* workers_to_add, unsigned
 {
 	if(nworkers_to_add > 0 && hypervisor.resize[sched_ctx])
 	{
+		_print_current_time();
 		int j;
 		printf("add to ctx %d:", sched_ctx);
 		for(j = 0; j < nworkers_to_add; j++)
@@ -443,6 +500,7 @@ void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove,
 {
 	if(nworkers_to_remove > 0 && hypervisor.resize[sched_ctx])
 	{
+		_print_current_time();
 		int nworkers=0;
 		int workers[nworkers_to_remove];
 
@@ -488,31 +546,6 @@ void sched_ctx_hypervisor_remove_workers_from_sched_ctx(int* workers_to_remove,
 	return;
 }
 
-static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
-{
-	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
-		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
-}
-
-double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
-{
-	double ret_val = 0.0;
-	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
-		ret_val += sc_w->elapsed_flops[i];
-	return ret_val;
-}
-
-double sched_ctx_hypervisor_get_total_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
-{
-	double ret_val = 0.0;
-	int i;
-	for(i = 0; i < STARPU_NMAXWORKERS; i++)
-		ret_val += sc_w->total_elapsed_flops[i];
-	return ret_val;
-}
-
 static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 {
 	if(worker != -1 && !starpu_sched_ctx_contains_worker(worker, sched_ctx))
@@ -597,18 +630,7 @@ static unsigned _ack_resize_completed(unsigned sched_ctx, int worker)
 
 				starpu_sched_ctx_remove_workers(moved_workers, nmoved_workers, sender_sched_ctx);
 
-				/* info concerning only the gflops_rate strateg */
-				struct sched_ctx_hypervisor_wrapper *sender_sc_w = &hypervisor.sched_ctx_w[sender_sched_ctx];
-				struct sched_ctx_hypervisor_wrapper *receiver_sc_w = &hypervisor.sched_ctx_w[receiver_sched_ctx];
-
-				double start_time =  starpu_timing_now();
-				sender_sc_w->start_time = start_time;
-				sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
-				_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
-
-				receiver_sc_w->start_time = start_time;
-				receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
-				_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
+				_reset_resize_sample_info(sender_sched_ctx, receiver_sched_ctx);
 
 				hypervisor.resize[sender_sched_ctx] = 1;
 				//	hypervisor.resize[receiver_sched_ctx] = 1;
@@ -701,7 +723,7 @@ static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flo
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
 
 	if(hypervisor.resize[sched_ctx])
-	{
+	{	
 		if(hypervisor.policy.handle_poped_task)
 			hypervisor.policy.handle_poped_task(sched_ctx, worker);
 	}

+ 3 - 0
sched_ctx_hypervisor/src/sched_ctx_hypervisor_intern.h

@@ -70,6 +70,9 @@ struct sched_ctx_hypervisor
 	pthread_mutex_t resize_mut[STARPU_NMAX_SCHED_CTXS];
 	struct size_request *sr;
 	int check_min_tasks[STARPU_NMAX_SCHED_CTXS];
+
+	/* time when the hypervisor started */
+	double start_executing_time;
 };
 
 struct sched_ctx_hypervisor_adjustment

+ 24 - 6
src/core/sched_ctx.c

@@ -914,13 +914,31 @@ unsigned starpu_sched_ctx_get_nshared_workers(unsigned sched_ctx_id, unsigned sc
 
 unsigned starpu_sched_ctx_contains_worker(int workerid, unsigned sched_ctx_id)
 {
-	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
-	unsigned i;
-	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
-	{
-		if(worker->sched_ctx[i] && worker->sched_ctx[i]->id == sched_ctx_id)
+/* 	struct _starpu_worker *worker = _starpu_get_worker_struct(workerid); */
+/* 	unsigned i; */
+/* 	for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++) */
+/* 	{ */
+/* 		if(worker->sched_ctx[i] && worker->sched_ctx[i]->id == sched_ctx_id) */
+/* 			return 1; */
+/* 	} */
+        struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
+
+        struct starpu_sched_ctx_worker_collection *workers = sched_ctx->workers;
+        int worker;
+
+	struct starpu_iterator it;
+        if(workers->init_iterator)
+                workers->init_iterator(workers, &it);
+
+
+        while(workers->has_next(workers, &it))
+        {
+                worker = workers->get_next(workers, &it);
+		if(worker == workerid)
 			return 1;
-	}
+        }
+
+
 	return 0;
 }
 

+ 7 - 1
src/core/sched_policy.c

@@ -517,6 +517,8 @@ struct _starpu_sched_ctx* _get_next_sched_ctx_to_pop_into(struct _starpu_worker
 	{
 		sched_ctx = worker->sched_ctx[i];
 
+		if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS && worker->removed_from_ctx[sched_ctx->id])
+			return sched_ctx;
 		if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS &&
 		   sched_ctx->pop_counter[worker->workerid] < worker->nsched_ctxs &&
 		   smallest_counter > sched_ctx->pop_counter[worker->workerid])
@@ -563,6 +565,8 @@ pick:
 	{
 		struct _starpu_sched_ctx *sched_ctx;
 
+		unsigned lucky_ctx = STARPU_NMAX_SCHED_CTXS;
+
 		int been_here[STARPU_NMAX_SCHED_CTXS];
 		int i;
 		for(i = 0; i < STARPU_NMAX_SCHED_CTXS; i++)
@@ -578,7 +582,10 @@ pick:
 			if(sched_ctx != NULL && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
 			{
 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
+				{
 					task = sched_ctx->sched_policy->pop_task(sched_ctx->id);
+					lucky_ctx = sched_ctx->id;
+				}
 			}
 
 			if(!task && worker->removed_from_ctx[sched_ctx->id])
@@ -596,7 +603,6 @@ pick:
 			sched_ctx->pop_counter[worker->workerid]++;
 
 		}
-
 	  }
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR

+ 1 - 2
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -548,8 +548,7 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 				if (conversion_time > 0.0)
 					local_task_length[worker_ctx][nimpl] += conversion_time;
 			}
-
-
+			
 			double ntasks_end = fifo->ntasks / starpu_worker_get_relative_speedup(perf_arch);
 
 			if (ntasks_best == -1