
ispeed considers data transfer

Andra Hugo 12 years ago
commit fbe2657f9f

+ 2 - 0
include/starpu_perfmodel.h

@@ -212,6 +212,8 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 
+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
+
 #ifdef __cplusplus
 }
 #endif

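The new helper exposes the RAM-to-CUDA bandwidth measured at calibration time. A minimal usage sketch, assuming one CUDA worker per device and that the value is returned in MB/s (which is what the conversion in policy_tools.c below suggests):

#include <stdio.h>
#include <starpu.h>
#include <starpu_perfmodel.h>

/* Sketch: print the calibrated RAM->CUDA bandwidth of every CUDA device.
 * Assumes the value is expressed in MB/s (i.e. bytes per microsecond). */
int main(void)
{
	if (starpu_init(NULL) != 0)
		return 1;
	unsigned dev, ncuda = starpu_cuda_worker_get_count();
	for (dev = 0; dev < ncuda; dev++)
		printf("RAM -> CUDA %u: %.2f MB/s\n", dev, starpu_get_bandwidth_RAM_CUDA(dev));
	starpu_shutdown();
	return 0;
}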
+ 2 - 2
include/starpu_sched_ctx.h

@@ -68,14 +68,14 @@ struct starpu_performance_counters
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops);
+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops, size_t data_size);
 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
 };
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters);
-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops);
+void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size);
 void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 

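Any hypervisor wired in through starpu_set_perf_counters() now has to provide a notify_poped_task callback that accepts the extra data_size argument. A minimal sketch of a conforming callback (the accumulator names are hypothetical; only the signature comes from the header above):

#include <starpu.h>
#include <starpu_sched_ctx.h>

/* Hypothetical per-context accumulators, for illustration only. */
static double ctx_flops[STARPU_NMAX_SCHED_CTXS];
static size_t ctx_bytes[STARPU_NMAX_SCHED_CTXS];

static void my_notify_poped_task(unsigned sched_ctx_id, int worker, double flops, size_t data_size)
{
	(void) worker;
	ctx_flops[sched_ctx_id] += flops;     /* work carried by the finished task */
	ctx_bytes[sched_ctx_id] += data_size; /* bytes accessed by the finished task */
}

/* Registered together with the other callbacks, e.g.:
 *   struct starpu_performance_counters counters = { .notify_poped_task = my_notify_poped_task, ... };
 *   starpu_set_perf_counters(sched_ctx_id, &counters);
 */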
+ 3 - 0
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -127,6 +127,9 @@ struct sched_ctx_hypervisor_wrapper
 	/* number of flops executed since last resizing */
 	double elapsed_flops[STARPU_NMAXWORKERS];
 
+	/* quantity of data (in bytes) processed on each worker in this ctx since last resizing */
+	size_t elapsed_data[STARPU_NMAXWORKERS];
+
 	/* the average speed of workers when they belonged to this context */
 	double ref_velocity[STARPU_NMAXWORKERS];
 

+ 1 - 1
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -56,7 +56,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
 			}
 			
-			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
+//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
 		}
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */

+ 8 - 2
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c

@@ -283,6 +283,12 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		{
 			tmp_nw_move[w] = 0;
 			tmp_nw_add[w] = 0;
+			int i;
+			for(i = 0; i < STARPU_NMAXWORKERS; i++)
+			{
+				tmp_workers_move[w][i] = -1;
+				tmp_workers_add[w][i] = -1;
+			}
 		}
 
 		/* find workers that ctx s has to give away */
@@ -413,7 +419,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 				
 				if(nw_move > 0)
 				{
-					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 1);
+					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
 					nw_move = 0;
 				}
 
@@ -452,7 +458,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		}
 
 		if(nw_move > 0)
-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 1);
+			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
 	}
 }
 

+ 8 - 0
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c

@@ -457,6 +457,7 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 		return -1.0;
 
         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
+	size_t elapsed_data_used = sc_w->elapsed_data[worker];
 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
 
@@ -479,6 +480,13 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
         {
                 double curr_time = starpu_timing_now();
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
+		enum starpu_archtype arch = starpu_worker_get_type(worker);
+		if(arch == STARPU_CUDA_WORKER)
+		{
+			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
+			elapsed_time += (elapsed_data_used / transfer_velocity) / 1000000;
+		}
+
                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
                 return vel;

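With elapsed_data tracked, the velocity of a CUDA worker is now estimated against the compute time plus an estimate of the host-to-device transfer time: vel = elapsed_flops / (elapsed_time + elapsed_data / bandwidth), with the bandwidth apparently expressed in MB/s (bytes per microsecond), hence the division by 1000000. A small standalone sketch with illustrative numbers (the 8000 MB/s figure is an assumption, not a measured value):

#include <stdio.h>

int main(void)
{
	double elapsed_flops = 400.0;  /* Gflops completed in the current sample */
	double elapsed_time  = 2.0;    /* seconds of wall-clock time in the sample */
	double elapsed_data  = 2e9;    /* bytes pushed to the CUDA device */
	double bandwidth     = 8000.0; /* assumed RAM->CUDA bandwidth, in MB/s */

	/* bytes / (MB/s) == bytes / (bytes per microsecond) -> microseconds */
	double transfer_time = (elapsed_data / bandwidth) / 1000000.0; /* in seconds */
	double vel = elapsed_flops / (elapsed_time + transfer_time);

	printf("transfer adds %.3f s: velocity drops from %.1f to %.1f Gflops/s\n",
	       transfer_time, elapsed_flops / elapsed_time, vel);
	return 0;
}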
+ 9 - 11
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -23,7 +23,7 @@ struct starpu_performance_counters* perf_counters = NULL;
 
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
 static void notify_pushed_task(unsigned sched_ctx, int worker);
-static void notify_poped_task(unsigned sched_ctx, int worker, double flops);
+static void notify_poped_task(unsigned sched_ctx, int worker, double flops, size_t data_size);
 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
 static void notify_idle_end(unsigned sched_ctx, int  worker);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
@@ -158,6 +158,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
+			hypervisor.sched_ctx_w[i].elapsed_data[j] = 0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
@@ -364,7 +365,11 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 {
 	int i;
 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
+		if(val == 0)
+			hypervisor.sched_ctx_w[sched_ctx].elapsed_data[i] = 0;
+	}
 }
 
 double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
@@ -396,7 +401,7 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 	sender_sc_w->start_time = start_time;
 	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
-	
+
 	receiver_sc_w->start_time = start_time;
 	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
@@ -415,14 +420,6 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 			printf(" %d", workers_to_move[j]);
 		printf("\n");
 
-/* 		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int)); */
-/* 		int ncpus; */
-
-/* 		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus); */
-
-/* //		if(ncpus != 0) */
-/* //			starpu_sched_ctx_remove_workers(cpus, ncpus, sender_sched_ctx); */
-
 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
 
 		if(now)
@@ -715,10 +712,11 @@ static void notify_pushed_task(unsigned sched_ctx, int worker)
 }
 
 /* notifies the hypervisor that a task was poped from the queue of the worker */
-static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops)
+static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops, size_t data_size)
 {
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size;
 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
 

+ 7 - 1
src/core/jobs.c

@@ -211,7 +211,13 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		_starpu_sched_post_exec_hook(task);
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 		int workerid = starpu_worker_get_id();
-		starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops);
+		int i;
+		size_t data_size = 0;
+		for(i = 0; i < STARPU_NMAXBUFS; i++)
+			if(task->handles[i] != NULL)
+				data_size += _starpu_data_get_size(task->handles[i]);
+
+		starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops, data_size);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 	}
 

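The data_size reported to the hypervisor is simply the sum of the sizes of the handles attached to the terminating task. A toy sketch of what that sum amounts to for a task accessing two vectors of one million floats (illustrative numbers only):

#include <stdio.h>

int main(void)
{
	/* Two vector handles of one million floats each (illustrative numbers). */
	size_t nx = 1000000;
	size_t per_handle = nx * sizeof(float); /* 4 MB per registered vector */
	size_t data_size  = 2 * per_handle;     /* 8 MB handed to starpu_call_poped_task_cb() */
	printf("data_size = %zu bytes (%.1f MB)\n", data_size, data_size / 1e6);
	return 0;
}

At the 8000 MB/s bandwidth assumed in the previous sketch, those 8 MB add roughly a millisecond to the estimated elapsed time.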
+ 5 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -1344,6 +1344,11 @@ static void write_bus_bandwidth_file_content(void)
 }
 #endif /* STARPU_SIMGRID */
 
+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
+{
+	return bandwidth_matrix[0][cudadev+1];
+}
+
 void starpu_bus_print_bandwidth(FILE *f)
 {
 	unsigned src, dst, maxnode;

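The lookup relies on memory node 0 being RAM and CUDA device d being backed by node d+1, i.e. on the CUDA nodes being registered right after RAM. The parameter is a CUDA device id, while the caller in policy_tools.c above passes a worker id; a hedged sketch that maps a worker to its device first (assuming starpu_worker_get_devid() is available in this version):

#include <starpu.h>
#include <starpu_perfmodel.h>

/* Sketch: RAM->CUDA bandwidth for a given worker, going through the
 * worker's CUDA device id rather than the worker id itself. */
static double worker_ram_to_cuda_bandwidth(int workerid)
{
	if (starpu_worker_get_type(workerid) != STARPU_CUDA_WORKER)
		return -1.0; /* no host->device transfer to account for */
	int devid = starpu_worker_get_devid(workerid);
	return starpu_get_bandwidth_RAM_CUDA((unsigned) devid);
}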
+ 2 - 2
src/core/sched_ctx.c

@@ -1020,12 +1020,12 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 
-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops)
+void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
 	   && sched_ctx->perf_counters != NULL)
-		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops);
+		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops, data_size);
 }
 
 void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)