
ispeed considers data transfer

Andra Hugo 12 years ago
commit fbe2657f9f

+ 2 - 0
include/starpu_perfmodel.h

@@ -212,6 +212,8 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 
+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
+
 #ifdef __cplusplus
 }
 #endif

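The new helper exposes the RAM-to-CUDA bandwidth measured at calibration time. A minimal usage sketch, assuming one CUDA worker per device and that the value is returned in MB/s (which is what the conversion in policy_tools.c below suggests):

#include <stdio.h>
#include <starpu.h>
#include <starpu_perfmodel.h>

/* Sketch: print the calibrated RAM->CUDA bandwidth of every CUDA device.
 * Assumes the value is expressed in MB/s (i.e. bytes per microsecond). */
int main(void)
{
	if (starpu_init(NULL) != 0)
		return 1;
	unsigned dev, ncuda = starpu_cuda_worker_get_count();
	for (dev = 0; dev < ncuda; dev++)
		printf("RAM -> CUDA %u: %.2f MB/s\n", dev, starpu_get_bandwidth_RAM_CUDA(dev));
	starpu_shutdown();
	return 0;
}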
+ 2 - 2
include/starpu_sched_ctx.h

@@ -68,14 +68,14 @@ struct starpu_performance_counters
 	void (*notify_idle_cycle)(unsigned sched_ctx_id, int worker, double idle_time);
 	void (*notify_idle_end)(unsigned sched_ctx_id, int worker);
 	void (*notify_pushed_task)(unsigned sched_ctx_id, int worker);
-	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops);
+	void (*notify_poped_task)(unsigned sched_ctx_id, int worker, double flops, size_t data_size);
 	void (*notify_post_exec_hook)(unsigned sched_ctx_id, int taskid);
 	void (*notify_submitted_job)(struct starpu_task *task, uint32_t footprint);
 };
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 void starpu_set_perf_counters(unsigned sched_ctx_id, struct starpu_performance_counters *perf_counters);
-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops);
+void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size);
 void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 

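Any hypervisor wired in through starpu_set_perf_counters() now has to provide a notify_poped_task callback that accepts the extra data_size argument. A minimal sketch of a conforming callback (the accumulator names are hypothetical; only the signature comes from the header above):

#include <starpu.h>
#include <starpu_sched_ctx.h>

/* Hypothetical per-context accumulators, for illustration only. */
static double ctx_flops[STARPU_NMAX_SCHED_CTXS];
static size_t ctx_bytes[STARPU_NMAX_SCHED_CTXS];

static void my_notify_poped_task(unsigned sched_ctx_id, int worker, double flops, size_t data_size)
{
	(void) worker;
	ctx_flops[sched_ctx_id] += flops;     /* work carried by the finished task */
	ctx_bytes[sched_ctx_id] += data_size; /* bytes accessed by the finished task */
}

/* Registered together with the other callbacks, e.g.:
 *   struct starpu_performance_counters counters = { .notify_poped_task = my_notify_poped_task, ... };
 *   starpu_set_perf_counters(sched_ctx_id, &counters);
 */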
+ 3 - 0
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -127,6 +127,9 @@ struct sched_ctx_hypervisor_wrapper
 	/* number of flops executed since last resizing */
 	double elapsed_flops[STARPU_NMAXWORKERS];
 
+	/* quantity of data (in bytes) processed on each worker in this ctx since last resizing */
+	size_t elapsed_data[STARPU_NMAXWORKERS];
+
 	/* the average speed of workers when they belonged to this context */
 	double ref_velocity[STARPU_NMAXWORKERS];
 

+ 1 - 1
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -56,7 +56,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
 			}
 			
-			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
+//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
 		}
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
 		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */

+ 8 - 2
sched_ctx_hypervisor/src/hypervisor_policies/lp_tools.c

@@ -283,6 +283,12 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		{
 			tmp_nw_move[w] = 0;
 			tmp_nw_add[w] = 0;
+			int i;
+			for(i = 0; i < STARPU_NMAXWORKERS; i++)
+			{
+				tmp_workers_move[w][i] = -1;
+				tmp_workers_add[w][i] = -1;
+			}
 		}
 
 		/* find workers that ctx s has to give away */
@@ -413,7 +419,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 				
 				if(nw_move > 0)
 				{
-					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 1);
+					sched_ctx_hypervisor_move_workers(sched_ctxs[s], sched_ctxs[s2], workers_move, nw_move, 0);
 					nw_move = 0;
 				}
 
@@ -452,7 +458,7 @@ void _lp_redistribute_resources_in_ctxs(int ns, int nw, int res_rounded[ns][nw],
 		}
 
 		if(nw_move > 0)
-			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 1);
+			sched_ctx_hypervisor_remove_workers_from_sched_ctx(workers_move, nw_move, sched_ctxs[s], 0);
 	}
 }
 

+ 8 - 0
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c

@@ -457,6 +457,7 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 		return -1.0;
 
         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
+	size_t elapsed_data_used = sc_w->elapsed_data[worker];
 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
 
@@ -479,6 +480,13 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
         {
                 double curr_time = starpu_timing_now();
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
+		enum starpu_archtype arch = starpu_worker_get_type(worker);
+		if(arch == STARPU_CUDA_WORKER)
+		{
+			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
+			elapsed_time += (elapsed_data_used / transfer_velocity) / 1000000;
+		}
+
                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
                 return vel;

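With elapsed_data tracked, the velocity of a CUDA worker is now estimated against the compute time plus an estimate of the host-to-device transfer time: vel = elapsed_flops / (elapsed_time + elapsed_data / bandwidth), with the bandwidth apparently expressed in MB/s (bytes per microsecond), hence the division by 1000000. A small standalone sketch with illustrative numbers (the 8000 MB/s figure is an assumption, not a measured value):

#include <stdio.h>

int main(void)
{
	double elapsed_flops = 400.0;  /* Gflops completed in the current sample */
	double elapsed_time  = 2.0;    /* seconds of wall-clock time in the sample */
	double elapsed_data  = 2e9;    /* bytes pushed to the CUDA device */
	double bandwidth     = 8000.0; /* assumed RAM->CUDA bandwidth, in MB/s */

	/* bytes / (MB/s) == bytes / (bytes per microsecond) -> microseconds */
	double transfer_time = (elapsed_data / bandwidth) / 1000000.0; /* in seconds */
	double vel = elapsed_flops / (elapsed_time + transfer_time);

	printf("transfer adds %.3f s: velocity drops from %.1f to %.1f Gflops/s\n",
	       transfer_time, elapsed_flops / elapsed_time, vel);
	return 0;
}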
+ 9 - 11
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -23,7 +23,7 @@ struct starpu_performance_counters* perf_counters = NULL;
 
 static void notify_idle_cycle(unsigned sched_ctx, int worker, double idle_time);
 static void notify_pushed_task(unsigned sched_ctx, int worker);
-static void notify_poped_task(unsigned sched_ctx, int worker, double flops);
+static void notify_poped_task(unsigned sched_ctx, int worker, double flops, size_t data_size);
 static void notify_post_exec_hook(unsigned sched_ctx, int taskid);
 static void notify_idle_end(unsigned sched_ctx, int  worker);
 static void notify_submitted_job(struct starpu_task *task, unsigned footprint);
@@ -158,6 +158,7 @@ struct starpu_performance_counters* sched_ctx_hypervisor_init(struct sched_ctx_h
 			hypervisor.sched_ctx_w[i].pushed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
+			hypervisor.sched_ctx_w[i].elapsed_data[j] = 0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
@@ -364,7 +365,11 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 {
 	int i;
 	for(i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
+		if(val == 0)
+			hypervisor.sched_ctx_w[sched_ctx].elapsed_data[i] = 0;
+	}
 }
 
 double sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(struct sched_ctx_hypervisor_wrapper* sc_w)
@@ -396,7 +401,7 @@ void _reset_resize_sample_info(unsigned sender_sched_ctx, unsigned receiver_sche
 	sender_sc_w->start_time = start_time;
 	sender_sc_w->remaining_flops = sender_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sender_sc_w);
 	_set_elapsed_flops_per_sched_ctx(sender_sched_ctx, 0.0);
-	
+
 	receiver_sc_w->start_time = start_time;
 	receiver_sc_w->remaining_flops = receiver_sc_w->remaining_flops - sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(receiver_sc_w);
 	_set_elapsed_flops_per_sched_ctx(receiver_sched_ctx, 0.0);
@@ -415,14 +420,6 @@ void sched_ctx_hypervisor_move_workers(unsigned sender_sched_ctx, unsigned recei
 			printf(" %d", workers_to_move[j]);
 		printf("\n");
 
-/* 		int *cpus = (int*) malloc(nworkers_to_move * sizeof(int)); */
-/* 		int ncpus; */
-
-/* 		_get_cpus(workers_to_move, nworkers_to_move, cpus, &ncpus); */
-
-/* //		if(ncpus != 0) */
-/* //			starpu_sched_ctx_remove_workers(cpus, ncpus, sender_sched_ctx); */
-
 		starpu_sched_ctx_add_workers(workers_to_move, nworkers_to_move, receiver_sched_ctx);
 
 		if(now)
@@ -715,10 +712,11 @@ static void notify_pushed_task(unsigned sched_ctx, int worker)
 }
 
 /* notifies the hypervisor that a task was poped from the queue of the worker */
-static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops)
+static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flops, size_t data_size)
 {
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size;
 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
 

+ 7 - 1
src/core/jobs.c

@@ -211,7 +211,13 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		_starpu_sched_post_exec_hook(task);
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 		int workerid = starpu_worker_get_id();
-		starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops);
+		int i;
+		size_t data_size = 0;
+		for(i = 0; i < STARPU_NMAXBUFS; i++)
+			if(task->handles[i] != NULL)
+				data_size += _starpu_data_get_size(task->handles[i]);
+
+		starpu_call_poped_task_cb(workerid, task->sched_ctx, task->flops, data_size);
 #endif //STARPU_USE_SCHED_CTX_HYPERVISOR
 	}
 

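The data_size reported to the hypervisor is simply the sum of the sizes of the handles attached to the terminating task. A toy sketch of what that sum amounts to for a task accessing two vectors of one million floats (illustrative numbers only):

#include <stdio.h>

int main(void)
{
	/* Two vector handles of one million floats each (illustrative numbers). */
	size_t nx = 1000000;
	size_t per_handle = nx * sizeof(float); /* 4 MB per registered vector */
	size_t data_size  = 2 * per_handle;     /* 8 MB handed to starpu_call_poped_task_cb() */
	printf("data_size = %zu bytes (%.1f MB)\n", data_size, data_size / 1e6);
	return 0;
}

At the 8000 MB/s bandwidth assumed in the previous sketch, those 8 MB add roughly a millisecond to the estimated elapsed time.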
+ 5 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -1344,6 +1344,11 @@ static void write_bus_bandwidth_file_content(void)
 }
 #endif /* STARPU_SIMGRID */
 
+double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
+{
+	return bandwidth_matrix[0][cudadev+1];
+}
+
 void starpu_bus_print_bandwidth(FILE *f)
 {
 	unsigned src, dst, maxnode;

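The lookup relies on memory node 0 being RAM and CUDA device d being backed by node d+1, i.e. on the CUDA nodes being registered right after RAM. The parameter is a CUDA device id, while the caller in policy_tools.c above passes a worker id; a hedged sketch that maps a worker to its device first (assuming starpu_worker_get_devid() is available in this version):

#include <starpu.h>
#include <starpu_perfmodel.h>

/* Sketch: RAM->CUDA bandwidth for a given worker, going through the
 * worker's CUDA device id rather than the worker id itself. */
static double worker_ram_to_cuda_bandwidth(int workerid)
{
	if (starpu_worker_get_type(workerid) != STARPU_CUDA_WORKER)
		return -1.0; /* no host->device transfer to account for */
	int devid = starpu_worker_get_devid(workerid);
	return starpu_get_bandwidth_RAM_CUDA((unsigned) devid);
}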
+ 2 - 2
src/core/sched_ctx.c

@@ -1020,12 +1020,12 @@ void starpu_sched_ctx_finished_submit(unsigned sched_ctx_id)
 
 #ifdef STARPU_USE_SCHED_CTX_HYPERVISOR
 
-void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops)
+void starpu_call_poped_task_cb(int workerid, unsigned sched_ctx_id, double flops, size_t data_size)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	if(sched_ctx != NULL && sched_ctx_id != 0 && sched_ctx_id != STARPU_NMAX_SCHED_CTXS
 	   && sched_ctx->perf_counters != NULL)
-		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops);
+		sched_ctx->perf_counters->notify_poped_task(sched_ctx_id, workerid, flops, data_size);
 }
 
 void starpu_call_pushed_task_cb(int workerid, unsigned sched_ctx_id)