Andra Hugo 12 rokov pred
rodič
commit
cc8ea216d1

+ 1 - 1
examples/sched_ctx_utils/sched_ctx_utils.c

@@ -93,7 +93,7 @@ void* start_bench(void *val)
 	pthread_setspecific(key, &p->id);
 
 	if(p->ctx != 0)
-		starpu_sched_ctx_set_task_context(&p->ctx);
+		starpu_sched_ctx_set_context(&p->ctx);
 
 	for(i = 0; i < NSAMPLES; i++)
 		p->bench(p->size, p->nblocks);

+ 2 - 0
include/starpu_perfmodel.h

@@ -212,7 +212,9 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 
+/* use bandwidth & latency to compute the velocity of resources */
 double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
+double starpu_get_latency_RAM_CUDA(unsigned cudadev);
 
 #ifdef __cplusplus
 }

+ 2 - 2
include/starpu_sched_ctx.h

@@ -101,9 +101,9 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_get_worker_collectio
 pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
 #endif
 
-void starpu_sched_ctx_set_task_context(unsigned *sched_ctx_id);
+void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
 
-unsigned starpu_sched_ctx_get_task_context(void);
+unsigned starpu_sched_ctx_get_context(void);
 
 void starpu_sched_ctx_notify_hypervisor_exists(void);
 

+ 1 - 1
sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c

@@ -48,7 +48,7 @@ int tag = 1;
 void* start_thread(void *arg)
 {
 	unsigned sched_ctx = *((unsigned*)arg);
-	starpu_sched_ctx_set_task_context(&sched_ctx);
+	starpu_sched_ctx_set_context(&sched_ctx);
 
 	struct starpu_task *task[10];
 	struct params params[10];

+ 1 - 1
sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c

@@ -97,7 +97,7 @@ void* start_bench(void *val)
 	pthread_setspecific(key, &p->id);
 
 	if(p->ctx != 0)
-		starpu_sched_ctx_set_task_context(&p->ctx);
+		starpu_sched_ctx_set_context(&p->ctx);
 
 	for(i = 0; i < NSAMPLES; i++)
 		p->bench(p->mat[i], p->size, p->nblocks);

+ 3 - 0
sched_ctx_hypervisor/include/sched_ctx_hypervisor.h

@@ -130,6 +130,9 @@ struct sched_ctx_hypervisor_wrapper
 	/* data quantity executed on each worker in this ctx */
 	size_t elapsed_data[STARPU_NMAXWORKERS];
 
+	/* number of tasks executed on each worker in this ctx */
+	int elapsed_tasks[STARPU_NMAXWORKERS];
+
 	/* the average speed of workers when they belonged to this context */
 	double ref_velocity[STARPU_NMAXWORKERS];
 

+ 17 - 11
sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -36,6 +36,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
 	for(s = 0; s < ns; s++)
 	{
+		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 		for(w = 0; w < nw; w++)
 		{
 			w_in_s[s][w] = 0.0;
@@ -44,7 +45,6 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 			draft_flops_on_w[s][w] = 0.0;
 			int worker = workers == NULL ? w : workers[w];
 
-			sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
 			if(velocity[s][w] == -1.0)
 			{
@@ -53,13 +53,15 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 				if(velocity[s][w] == -1.0)
 					velocity[s][w] = sc_w->ref_velocity[worker];
 				if(velocity[s][w] == -1.0)
-					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
+					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
 			}
 			
-//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
+			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
 		}
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
-		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
+//		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
+		flops[s] = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w)/1000000000.0; // in gflops 
+		printf("%d: elapsed flops %lf\n", sched_ctxs[s], flops[s]);
 	}
 
 
@@ -67,8 +69,10 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 	   as starting point and then try to minimize it
 	   as increasing it a little for the faster ctxs */
 	double tmax = _get_slowest_ctx_exec_time();
-	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax;
-//	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
+/* 	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax; */
+	double smallest_tmax = tmax - 0.5*tmax;
+
+	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
 
 	double res = 1.0;
 	unsigned has_sol = 0;
@@ -150,7 +154,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 	int w, s;
 	glp_prob *lp;
 
-//	printf("try with tmax %lf\n", tmax);
+	printf("try with tmax %lf\n", tmax);
 	lp = glp_create_prob();
 	glp_set_prob_name(lp, "StarPU theoretical bound");
 	glp_set_obj_dir(lp, GLP_MAX);
@@ -332,7 +336,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 				w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*ns+colnum(w, s));
 			else
 				w_in_s[s][w] = glp_get_col_prim(lp, nw*ns+colnum(w,s));
-//			printf("w_in_s[s%d][w%d] = %lf flops[s%d][w%d] = %lf \n", s, w, w_in_s[s][w], s, w, flops_on_w[s][w]);
+			printf("w_in_s[s%d][w%d] = %lf flops[s%d][w%d] = %lf \n", s, w, w_in_s[s][w], s, w, flops_on_w[s][w]);
 		}
 
 	glp_delete_prob(lp);
@@ -397,13 +401,15 @@ static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker)
 						}
 					}
 				}
-/* 				for(s = 0; s < ns; s++) */
-/* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
-/* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
+				for(s = 0; s < ns; s++)
+					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0],
+					       nworkers_rounded[s][1], nworkers_rounded[s][0]);
 
 				_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
 
 			}
+			else
+				printf("no sol\n");
 		}
 		pthread_mutex_unlock(&act_hypervisor_mutex);
 	}

+ 22 - 15
sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c

@@ -410,13 +410,12 @@ double _get_slowest_ctx_exec_time(void)
 	{
 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 
-/*                 double elapsed_time = curr_time - sc_w->start_time; */
-/* 		if(elapsed_time > slowest_time) */
-/* 			slowest_time = elapsed_time; */
-
-//		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
-		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
+		double elapsed_time  = (curr_time - sc_w->start_time)/1000000;
+/* 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx); */
+/* 		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w); */
+/* 		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w); */
+/* 		double velocity = _get_ctx_velocity(sc_w); */
+/*                 double elapsed_time = (elapsed_flops/1000000000.0)/velocity; */
 		if(elapsed_time > slowest_time)
 			slowest_time = elapsed_time;
 
@@ -440,8 +439,11 @@ double _get_fastest_ctx_exec_time(void)
 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
 
 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
-
+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
+/* 		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);		 */
+/* 		double velocity = _get_ctx_velocity(sc_w); */
+/*                 double elapsed_time = (elapsed_flops/1000000000.0)/velocity; */
+		
 		if(elapsed_time < fastest_time)
 			fastest_time = elapsed_time;
 
@@ -458,6 +460,7 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 
         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
 	size_t elapsed_data_used = sc_w->elapsed_data[worker];
+	int elapsed_tasks = sc_w->elapsed_tasks[worker];
 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
 
@@ -480,12 +483,16 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
         {
                 double curr_time = starpu_timing_now();
                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
- 		enum starpu_archtype arch = starpu_worker_get_type(worker);
-		if(arch == STARPU_CUDA_WORKER)
-		{	
-			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
-			elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ;
-		}
+/*  		enum starpu_archtype arch = starpu_worker_get_type(worker); */
+/* 		if(arch == STARPU_CUDA_WORKER) */
+/* 		{	 */
+/* 			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker); */
+/* 			elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ; */
+/* 			double latency = starpu_get_latency_RAM_CUDA(worker); */
+/* //			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks); */
+/* 			elapsed_time += (elapsed_tasks * latency)/1000000; */
+/* //			printf("elapsed time after %lf \n", elapsed_time); */
+/* 		} */
 			
                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 

+ 5 - 0
sched_ctx_hypervisor/src/sched_ctx_hypervisor.c

@@ -159,6 +159,7 @@ struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct s
 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].elapsed_data[j] = 0;
+			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
@@ -368,7 +369,10 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 	{
 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
 		if(val == 0)
+		{
 			hypervisor.sched_ctx_w[sched_ctx].elapsed_data[i] = 0;
+			hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[i] = 0;
+		}
 	}
 }
 
@@ -717,6 +721,7 @@ static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flo
 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
+	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
 

+ 5 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -1349,6 +1349,11 @@ double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
 	return bandwidth_matrix[0][cudadev+1];
 }
 
+double starpu_get_latency_RAM_CUDA(unsigned cudadev)
+{
+	return latency_matrix[0][cudadev+1];
+}
+
 void starpu_bus_print_bandwidth(FILE *f)
 {
 	unsigned src, dst, maxnode;

+ 2 - 2
src/core/sched_ctx.c

@@ -751,12 +751,12 @@ void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
 }
 
-void starpu_sched_ctx_set_task_context(unsigned *sched_ctx)
+void starpu_sched_ctx_set_context(unsigned *sched_ctx)
 {
 	pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
 }
 
-unsigned starpu_sched_ctx_get_task_context()
+unsigned starpu_sched_ctx_get_context()
 {
 	unsigned *sched_ctx = (unsigned*)pthread_getspecific(sched_ctx_key);
 	if(sched_ctx == NULL)

+ 2 - 2
src/core/task.c

@@ -365,7 +365,7 @@ int starpu_task_submit(struct starpu_task *task)
 
 	if (task->sched_ctx == 0 && nsched_ctxs != 1 && !j->exclude_from_dag)
 	{
-		set_sched_ctx = starpu_sched_ctx_get_task_context();
+		set_sched_ctx = starpu_sched_ctx_get_context();
 		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
 			task->sched_ctx = set_sched_ctx;
 	}
@@ -633,7 +633,7 @@ void starpu_display_codelet_stats(struct starpu_codelet *cl)
 int starpu_task_wait_for_all(void)
 {
 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
-	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_task_context();
+	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_context();
 
 	/* if there is no indication about which context to wait,
 	   we wait for all tasks submitted to starpu */

+ 4 - 4
src/sched_policies/detect_combined_workers.c

@@ -82,7 +82,7 @@ static void synthesize_intermediate_workers(hwloc_obj_t *children, unsigned min,
 		{
 			if (nworkers >= min && nworkers <= max)
 			{
-				unsigned sched_ctx_id  = starpu_sched_ctx_get_task_context();
+				unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
 				if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 					sched_ctx_id = 0;
 				struct starpu_sched_ctx_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
@@ -134,7 +134,7 @@ static void find_and_assign_combinations(hwloc_obj_t obj, unsigned min, unsigned
 	if (nworkers >= min && nworkers <= max)
 	{
 		_STARPU_DEBUG("Adding it\n");
-		unsigned sched_ctx_id  = starpu_sched_ctx_get_task_context();
+		unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
 		if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 			sched_ctx_id = 0;
 
@@ -194,7 +194,7 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 
 static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers)
 {
-	unsigned sched_ctx_id  = starpu_sched_ctx_get_task_context();
+	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 		sched_ctx_id = 0;
 	int min;
@@ -250,7 +250,7 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 
 static void combine_all_cpu_workers(int *workerids, int nworkers)
 {
-	unsigned sched_ctx_id  = starpu_sched_ctx_get_task_context();
+	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 		sched_ctx_id = 0;
 	struct starpu_sched_ctx_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);