12 years ago · cc8ea216d1
--- a/examples/sched_ctx_utils/sched_ctx_utils.c
+++ b/examples/sched_ctx_utils/sched_ctx_utils.c
@@ -93,7 +93,7 @@ void* start_bench(void *val)
 
				 	pthread_setspecific(key, &p->id);
			
 
				 
			
 
				 	if(p->ctx != 0)
			
 
				-		starpu_sched_ctx_set_task_context(&p->ctx);
			
 
				+		starpu_sched_ctx_set_context(&p->ctx);
			
 
				 
			
 
				 	for(i = 0; i < NSAMPLES; i++)
			
 
				 		p->bench(p->size, p->nblocks);
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -212,7 +212,9 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 
				 void starpu_bus_print_bandwidth(FILE *f);
			
 
				 void starpu_bus_print_affinity(FILE *f);
			
 
				 
			
 
				+/* use bandwidth and latency to compute the velocity of resources */
			
 
				 double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
			
 
				+double starpu_get_latency_RAM_CUDA(unsigned cudadev);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -101,9 +101,9 @@ struct starpu_sched_ctx_worker_collection* starpu_sched_ctx_get_worker_collectio
 
				 pthread_mutex_t* starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
			
 
				 #endif
			
 
				 
			
 
				-void starpu_sched_ctx_set_task_context(unsigned *sched_ctx_id);
			
 
				+void starpu_sched_ctx_set_context(unsigned *sched_ctx_id);
			
 
				 
			
 
				-unsigned starpu_sched_ctx_get_task_context(void);
			
 
				+unsigned starpu_sched_ctx_get_context(void);
			
 
				 
			
 
				 void starpu_sched_ctx_notify_hypervisor_exists(void);
			
 
				 
			
--- a/sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
+++ b/sched_ctx_hypervisor/examples/app_driven_test/app_driven_test.c
@@ -48,7 +48,7 @@ int tag = 1;
 
				 void* start_thread(void *arg)
			
 
				 {
			
 
				 	unsigned sched_ctx = *((unsigned*)arg);
			
 
				-	starpu_sched_ctx_set_task_context(&sched_ctx);
			
 
				+	starpu_sched_ctx_set_context(&sched_ctx);
			
 
				 
			
 
				 	struct starpu_task *task[10];
			
 
				 	struct params params[10];
			
--- a/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
+++ b/sched_ctx_hypervisor/examples/sched_ctx_utils/sched_ctx_utils.c
@@ -97,7 +97,7 @@ void* start_bench(void *val)
 
				 	pthread_setspecific(key, &p->id);
			
 
				 
			
 
				 	if(p->ctx != 0)
			
 
				-		starpu_sched_ctx_set_task_context(&p->ctx);
			
 
				+		starpu_sched_ctx_set_context(&p->ctx);
			
 
				 
			
 
				 	for(i = 0; i < NSAMPLES; i++)
			
 
				 		p->bench(p->mat[i], p->size, p->nblocks);
			
--- a/sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
+++ b/sched_ctx_hypervisor/include/sched_ctx_hypervisor.h
@@ -130,6 +130,9 @@ struct sched_ctx_hypervisor_wrapper
 
				 	/* data quantity executed on each worker in this ctx */
			
 
				 	size_t elapsed_data[STARPU_NMAXWORKERS];
			
 
				 
			
 
				+	/* number of tasks executed on each worker in this ctx */
			
 
				+	int elapsed_tasks[STARPU_NMAXWORKERS];
			
 
				+
			
 
				 	/* the average speed of workers when they belonged to this context */
			
 
				 	double ref_velocity[STARPU_NMAXWORKERS];
			
 
				 
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
@@ -36,6 +36,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 
			
 
				 	for(s = 0; s < ns; s++)
			
 
				 	{
			
 
				+		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
			
 
				 		for(w = 0; w < nw; w++)
			
 
				 		{
			
 
				 			w_in_s[s][w] = 0.0;
			
@@ -44,7 +45,6 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 			draft_flops_on_w[s][w] = 0.0;
			
 
				 			int worker = workers == NULL ? w : workers[w];
			
 
				 
			
 
				-			sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
			
 
				 			velocity[s][w] = _get_velocity_per_worker(sc_w, worker);
			
 
				 			if(velocity[s][w] == -1.0)
			
 
				 			{
			
@@ -53,13 +53,15 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 				if(velocity[s][w] == -1.0)
			
 
				 					velocity[s][w] = sc_w->ref_velocity[worker];
			
 
				 				if(velocity[s][w] == -1.0)
			
 
				-					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 150.0;
			
 
				+					velocity[s][w] = arch == STARPU_CPU_WORKER ? 5.0 : 100.0;
			
 
				 			}
			
 
				 			
			
 
				-//			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
			
 
				+			printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
			
 
				 		}
			
 
				 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sched_ctxs[s]);
			
 
				-		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
			
 
				+//		flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
			
 
				+		flops[s] = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w)/1000000000.0; // in gflops 
			
 
				+		printf("%d: elapsed flops %lf\n", sched_ctxs[s], flops[s]);
			
 
				 	}
			
 
				 
			
 
				 
			
@@ -67,8 +69,10 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 	   as starting point and then try to minimize it
			
 
				 	   as increasing it a little for the faster ctxs */
			
 
				 	double tmax = _get_slowest_ctx_exec_time();
			
 
				-	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax;
			
 
				-//	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
			
 
				+/* 	double smallest_tmax = _get_fastest_ctx_exec_time(); //tmax - 0.5*tmax; */
			
 
				+	double smallest_tmax = tmax - 0.5*tmax;
			
 
				+
			
 
				+	printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
			
 
				 
			
 
				 	double res = 1.0;
			
 
				 	unsigned has_sol = 0;
			
@@ -150,7 +154,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 
				 	int w, s;
			
 
				 	glp_prob *lp;
			
 
				 
			
 
				-//	printf("try with tmax %lf\n", tmax);
			
 
				+	printf("try with tmax %lf\n", tmax);
			
 
				 	lp = glp_create_prob();
			
 
				 	glp_set_prob_name(lp, "StarPU theoretical bound");
			
 
				 	glp_set_obj_dir(lp, GLP_MAX);
			
@@ -332,7 +336,7 @@ static double _glp_resolve(int ns, int nw, double velocity[ns][nw], double flops
 
				 				w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*ns+colnum(w, s));
			
 
				 			else
			
 
				 				w_in_s[s][w] = glp_get_col_prim(lp, nw*ns+colnum(w,s));
			
 
				-//			printf("w_in_s[s%d][w%d] = %lf flops[s%d][w%d] = %lf \n", s, w, w_in_s[s][w], s, w, flops_on_w[s][w]);
			
 
				+			printf("w_in_s[s%d][w%d] = %lf flops[s%d][w%d] = %lf \n", s, w, w_in_s[s][w], s, w, flops_on_w[s][w]);
			
 
				 		}
			
 
				 
			
 
				 	glp_delete_prob(lp);
			
@@ -397,13 +401,15 @@ static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker)
 
				 						}
			
 
				 					}
			
 
				 				}
			
 
				-/* 				for(s = 0; s < ns; s++) */
			
 
				-/* 					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
			
 
				-/* 					       nworkers_rounded[s][1], nworkers_rounded[s][0]); */
			
 
				+				for(s = 0; s < ns; s++)
			
 
				+					printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0],
			
 
				+					       nworkers_rounded[s][1], nworkers_rounded[s][0]);
			
 
				 
			
 
				 				_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
			
 
				 
			
 
				 			}
			
 
				+			else
			
 
				+				printf("no sol\n");
			
 
				 		}
			
 
				 		pthread_mutex_unlock(&act_hypervisor_mutex);
			
 
				 	}
			
--- a/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
+++ b/sched_ctx_hypervisor/src/hypervisor_policies/policy_tools.c
@@ -410,13 +410,12 @@ double _get_slowest_ctx_exec_time(void)
 
				 	{
			
 
				 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
			
 
				 
			
 
				-/*                 double elapsed_time = curr_time - sc_w->start_time; */
			
 
				-/* 		if(elapsed_time > slowest_time) */
			
 
				-/* 			slowest_time = elapsed_time; */
			
 
				-
			
 
				-//		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);
			
 
				-		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
			
 
				-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
			
 
				+		double elapsed_time  = (curr_time - sc_w->start_time)/1000000;
			
 
				+/* 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx); */
			
 
				+/* 		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w); */
			
 
				+/* 		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w); */
			
 
				+/* 		double velocity = _get_ctx_velocity(sc_w); */
			
 
				+/*                 double elapsed_time = (elapsed_flops/1000000000.0)/velocity; */
			
 
				 		if(elapsed_time > slowest_time)
			
 
				 			slowest_time = elapsed_time;
			
 
				 
			
@@ -440,8 +439,11 @@ double _get_fastest_ctx_exec_time(void)
 
				 		sc_w = sched_ctx_hypervisor_get_wrapper(sched_ctxs[s]);
			
 
				 
			
 
				 		struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
			
 
				-                double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
			
 
				-
			
 
				+		double elapsed_time = (config->ispeed_ctx_sample/1000000000.0)/_get_ctx_velocity(sc_w);
			
 
				+/* 		double elapsed_flops = sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(sc_w);		 */
			
 
				+/* 		double velocity = _get_ctx_velocity(sc_w); */
			
 
				+/*                 double elapsed_time = (elapsed_flops/1000000000.0)/velocity; */
			
 
				+		
			
 
				 		if(elapsed_time < fastest_time)
			
 
				 			fastest_time = elapsed_time;
			
 
				 
			
@@ -458,6 +460,7 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 
				 
			
 
				         double elapsed_flops = sc_w->elapsed_flops[worker] / 1000000000.0; /*in gflops */
			
 
				 	size_t elapsed_data_used = sc_w->elapsed_data[worker];
			
 
				+	int elapsed_tasks = sc_w->elapsed_tasks[worker];
			
 
				 	struct sched_ctx_hypervisor_policy_config *config = sched_ctx_hypervisor_get_config(sc_w->sched_ctx);
			
 
				 	double sample = config->ispeed_w_sample[worker] / 1000000000.0; /*in gflops */
			
 
				 
			
@@ -480,12 +483,16 @@ double _get_velocity_per_worker(struct sched_ctx_hypervisor_wrapper *sc_w, unsig
 
				         {
			
 
				                 double curr_time = starpu_timing_now();
			
 
				                 double elapsed_time = (curr_time - sc_w->start_time) / 1000000.0; /* in seconds */
			
 
				- 		enum starpu_archtype arch = starpu_worker_get_type(worker);
			
 
				-		if(arch == STARPU_CUDA_WORKER)
			
 
				-		{	
			
 
				-			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker);
			
 
				-			elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ;
			
 
				-		}
			
 
				+/*  		enum starpu_archtype arch = starpu_worker_get_type(worker); */
			
 
				+/* 		if(arch == STARPU_CUDA_WORKER) */
			
 
				+/* 		{	 */
			
 
				+/* 			double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker); */
			
 
				+/* 			elapsed_time +=  (elapsed_data_used / transfer_velocity) / 1000000 ; */
			
 
				+/* 			double latency = starpu_get_latency_RAM_CUDA(worker); */
			
 
				+/* //			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks); */
			
 
				+/* 			elapsed_time += (elapsed_tasks * latency)/1000000; */
			
 
				+/* //			printf("elapsed time after %lf \n", elapsed_time); */
			
 
				+/* 		} */
			
 
				 			
			
 
				                 double vel  = (elapsed_flops/elapsed_time);/* in Gflops/s */
			
 
				 		sc_w->ref_velocity[worker] = sc_w->ref_velocity[worker] > 0.0 ? (sc_w->ref_velocity[worker] + vel) / 2 : vel; 
			
--- a/sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
+++ b/sched_ctx_hypervisor/src/sched_ctx_hypervisor.c
@@ -159,6 +159,7 @@ struct starpu_sched_ctx_performance_counters* sched_ctx_hypervisor_init(struct s
 
				 			hypervisor.sched_ctx_w[i].poped_tasks[j] = 0;
			
 
				 			hypervisor.sched_ctx_w[i].elapsed_flops[j] = 0.0;
			
 
				 			hypervisor.sched_ctx_w[i].elapsed_data[j] = 0;
			
 
				+			hypervisor.sched_ctx_w[i].elapsed_tasks[j] = 0;
			
 
				 			hypervisor.sched_ctx_w[i].total_elapsed_flops[j] = 0.0;
			
 
				 			hypervisor.sched_ctx_w[i].worker_to_be_removed[j] = 0;
			
 
				 			hypervisor.sched_ctx_w[i].ref_velocity[j] = -1.0;
			
@@ -368,7 +369,10 @@ static void _set_elapsed_flops_per_sched_ctx(unsigned sched_ctx, double val)
 
				 	{
			
 
				 		hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[i] = val;
			
 
				 		if(val == 0)
			
 
				+		{
			
 
				 			hypervisor.sched_ctx_w[sched_ctx].elapsed_data[i] = 0;
			
 
				+			hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[i] = 0;
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -717,6 +721,7 @@ static void notify_poped_task(unsigned sched_ctx, int worker, double elapsed_flo
 
				 	hypervisor.sched_ctx_w[sched_ctx].poped_tasks[worker]++;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].elapsed_flops[worker] += elapsed_flops;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].elapsed_data[worker] += data_size ;
			
 
				+	hypervisor.sched_ctx_w[sched_ctx].elapsed_tasks[worker]++ ;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].total_elapsed_flops[worker] += elapsed_flops;
			
 
				 	hypervisor.sched_ctx_w[sched_ctx].remaining_flops -= elapsed_flops; //sched_ctx_hypervisor_get_elapsed_flops_per_sched_ctx(&hypervisor.sched_ctx_w[sched_ctx]);
			
 
				 
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1349,6 +1349,11 @@ double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
 
				 	return bandwidth_matrix[0][cudadev+1];
			
 
				 }
			
 
				 
			
 
				+double starpu_get_latency_RAM_CUDA(unsigned cudadev)
			
 
				+{
			
 
				+	return latency_matrix[0][cudadev+1];
			
 
				+}
			
 
				+
			
 
				 void starpu_bus_print_bandwidth(FILE *f)
			
 
				 {
			
 
				 	unsigned src, dst, maxnode;
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -751,12 +751,12 @@ void _starpu_increment_nsubmitted_tasks_of_sched_ctx(unsigned sched_ctx_id)
 
				 	_starpu_barrier_counter_increment(&sched_ctx->tasks_barrier);
			
 
				 }
			
 
				 
			
 
				-void starpu_sched_ctx_set_task_context(unsigned *sched_ctx)
			
 
				+void starpu_sched_ctx_set_context(unsigned *sched_ctx)
			
 
				 {
			
 
				 	pthread_setspecific(sched_ctx_key, (void*)sched_ctx);
			
 
				 }
			
 
				 
			
 
				-unsigned starpu_sched_ctx_get_task_context()
			
 
				+unsigned starpu_sched_ctx_get_context()
			
 
				 {
			
 
				 	unsigned *sched_ctx = (unsigned*)pthread_getspecific(sched_ctx_key);
			
 
				 	if(sched_ctx == NULL)
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -365,7 +365,7 @@ int starpu_task_submit(struct starpu_task *task)
 
				 
			
 
				 	if (task->sched_ctx == 0 && nsched_ctxs != 1 && !j->exclude_from_dag)
			
 
				 	{
			
 
				-		set_sched_ctx = starpu_sched_ctx_get_task_context();
			
 
				+		set_sched_ctx = starpu_sched_ctx_get_context();
			
 
				 		if (set_sched_ctx != STARPU_NMAX_SCHED_CTXS)
			
 
				 			task->sched_ctx = set_sched_ctx;
			
 
				 	}
			
@@ -633,7 +633,7 @@ void starpu_display_codelet_stats(struct starpu_codelet *cl)
 
				 int starpu_task_wait_for_all(void)
			
 
				 {
			
 
				 	unsigned nsched_ctxs = _starpu_get_nsched_ctxs();
			
 
				-	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_task_context();
			
 
				+	unsigned sched_ctx_id = nsched_ctxs == 1 ? 0 : starpu_sched_ctx_get_context();
			
 
				 
			
 
				 	/* if there is no indication about which context to wait,
			
 
				 	   we wait for all tasks submitted to starpu */
			
--- a/src/sched_policies/detect_combined_workers.c
+++ b/src/sched_policies/detect_combined_workers.c
@@ -82,7 +82,7 @@ static void synthesize_intermediate_workers(hwloc_obj_t *children, unsigned min,
 
				 		{
			
 
				 			if (nworkers >= min && nworkers <= max)
			
 
				 			{
			
 
				-				unsigned sched_ctx_id  = starpu_sched_ctx_get_task_context();
			
 
				+				unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
			
 
				 				if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
			
 
				 					sched_ctx_id = 0;
			
 
				 				struct starpu_sched_ctx_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
			
@@ -134,7 +134,7 @@ static void find_and_assign_combinations(hwloc_obj_t obj, unsigned min, unsigned
 
				 	if (nworkers >= min && nworkers <= max)
			
 
				 	{
			
 
				 		_STARPU_DEBUG("Adding it\n");
			
 
				-		unsigned sched_ctx_id  = starpu_sched_ctx_get_task_context();
			
 
				+		unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
			
 
				 		if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
			
 
				 			sched_ctx_id = 0;
			
 
				 
			
@@ -194,7 +194,7 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 
				 
			
 
				 static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers)
			
 
				 {
			
 
				-	unsigned sched_ctx_id  = starpu_sched_ctx_get_task_context();
			
 
				+	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
			
 
				 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
			
 
				 		sched_ctx_id = 0;
			
 
				 	int min;
			
@@ -250,7 +250,7 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 
				 
			
 
				 static void combine_all_cpu_workers(int *workerids, int nworkers)
			
 
				 {
			
 
				-	unsigned sched_ctx_id  = starpu_sched_ctx_get_task_context();
			
 
				+	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
			
 
				 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
			
 
				 		sched_ctx_id = 0;
			
 
				 	struct starpu_sched_ctx_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);