12 years ago · 9c038ac2be
--- a/doc/doxygen/chapters/api/performance_model.doxy
+++ b/doc/doxygen/chapters/api/performance_model.doxy
@@ -260,20 +260,16 @@ existing set of measurements done in good conditions, that StarPU
 
				 could benefit from instead of doing on-line measurements. An example
			
 
				 of use can be seen in \ref PerformanceModelExample.
			
 
				 
			
 
				-\fn double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
			
 
				+\fn double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
			
 
				 \ingroup API_Performance_Model
			
 
				-Used to compute the execution time of tasks
			
 
				+Return the bandwidth (in MB/s) of data transfer between two memory nodes.
			
 
				 
			
 
				-\fn double starpu_get_latency_RAM_CUDA(unsigned cudadev)
			
 
				+\fn double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
			
 
				 \ingroup API_Performance_Model
			
 
				-Used to compute the execution time of tasks
			
 
				+Return the latency (in µs) of data transfer between two memory nodes.
			
 
				 
			
 
				-\fn double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
			
 
				-\ingroup API_Performance_Mode
			
 
				-Used to compute the execution time of tasks
			
 
				-
			
 
				-\fn double starpu_get_latency_CUDA_RAM(unsigned cudadev)
			
 
				+\fn double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
			
 
				 \ingroup API_Performance_Model
			
 
				-Used to compute the execution time of tasks
			
 
				+Return the estimated time (in µs) to transfer a given size between two memory nodes.
			
 
				 
			
 
				 */
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -169,11 +169,9 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 
				 void starpu_bus_print_bandwidth(FILE *f);
			
 
				 void starpu_bus_print_affinity(FILE *f);
			
 
				 
			
 
				-double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
			
 
				-double starpu_get_latency_RAM_CUDA(unsigned cudadev);
			
 
				-double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev);
			
 
				-double starpu_get_latency_CUDA_RAM(unsigned cudadev);
			
 
				-
			
 
				+double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
			
 
				+double starpu_transfer_latency(unsigned src_node, unsigned dst_node);
			
 
				+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
+++ b/sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c
@@ -284,7 +284,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 
				 					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx);
			
 
				 					if(!worker_in_ctx)
			
 
				 					{
			
 
				-						double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker) / 1000;
			
 
				+						double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)) / 1000;
			
 
				 						speed[s][w] = (speed[s][w] * transfer_speed) / (speed[s][w] + transfer_speed);
			
 
				 					}
			
 
				 				}
			
--- a/sc_hypervisor/src/policies_utils/policy_tools.c
+++ b/sc_hypervisor/src/policies_utils/policy_tools.c
@@ -430,9 +430,9 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 
				 				{
			
 
				 					if(arch == STARPU_CUDA_WORKER)
			
 
				 					{
			
 
				-						double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker);
			
 
				+						double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker));
			
 
				 						transfer_time +=  (tp->data_size / transfer_speed) / 1000. ;
			
 
				-						double latency = starpu_get_latency_RAM_CUDA(worker);
			
 
				+						double latency = starpu_transfer_latency(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker));
			
 
				 						transfer_time += latency/1000.;
			
 
				 						
			
 
				 						
			
@@ -441,9 +441,9 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 
				 					{
			
 
				 						if(!starpu_sched_ctx_contains_type_of_worker(arch, tp->sched_ctx_id))
			
 
				 						{
			
 
				-							double transfer_speed = starpu_get_bandwidth_CUDA_RAM(worker);
			
 
				+							double transfer_speed = starpu_transfer_bandwidth(starpu_worker_get_memory_node(worker), STARPU_MAIN_RAM);
			
 
				 							transfer_time += (tp->data_size / transfer_speed) / 1000. ;
			
 
				-							double latency = starpu_get_latency_CUDA_RAM(worker);
			
 
				+							double latency = starpu_transfer_latency(starpu_worker_get_memory_node(worker), STARPU_MAIN_RAM);
			
 
				 							transfer_time += latency / 1000.;
			
 
				 						}
			
 
				 					}
			
--- a/sc_hypervisor/src/policies_utils/speed.c
+++ b/sc_hypervisor/src/policies_utils/speed.c
@@ -74,10 +74,10 @@ double sc_hypervisor_get_speed_per_worker(struct sc_hypervisor_wrapper *sc_w, un
 
				 /* /\* 			if(!worker_in_ctx) *\/ */
			
 
				 /* /\* 			{ *\/ */
			
 
				 
			
 
				-/* /\* 				double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker); *\/ */
			
 
				+/* /\* 				double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)); *\/ */
			
 
				 /* /\* 				elapsed_time +=  (elapsed_data_used / transfer_speed) / 1000000 ; *\/ */
			
 
				 /* /\* 			} *\/ */
			
 
				-/* 			double latency = starpu_get_latency_RAM_CUDA(worker); */
			
 
				+/* 			double latency = starpu_transfer_latency(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)); */
			
 
				 /* //			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks); */
			
 
				 /* 			elapsed_time += (elapsed_tasks * latency)/1000000; */
			
 
				 /* //			printf("elapsed time after %lf \n", elapsed_time); */
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -274,7 +274,7 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 
				 		return 0.0;
			
 
				 
			
 
				 	unsigned src_node = _starpu_select_src_node(handle, memory_node);
			
 
				-	return _starpu_predict_transfer_time(src_node, memory_node, size);
			
 
				+	return starpu_transfer_predict(src_node, memory_node, size);
			
 
				 }
			
 
				 
			
 
				 /* Data transfer performance modeling */
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
@@ -58,10 +58,6 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 void _starpu_create_sampling_directory_if_needed(void);
			
 
				 
			
 
				 void _starpu_load_bus_performance_files(void);
			
 
				-double _starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
			
 
				-double _starpu_transfer_latency(unsigned src_node, unsigned dst_node);
			
 
				-double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size);
			
 
				-
			
 
				 
			
 
				 void _starpu_set_calibrate_flag(unsigned val);
			
 
				 unsigned _starpu_get_calibrate_flag(void);
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -1379,25 +1379,8 @@ static void write_bus_bandwidth_file_content(void)
 
				 }
			
 
				 #endif /* STARPU_SIMGRID */
			
 
				 
			
 
				-double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
			
 
				-{
			
 
				-	return bandwidth_matrix[STARPU_MAIN_RAM][cudadev+1];
			
 
				-}
			
 
				-
			
 
				-double starpu_get_latency_RAM_CUDA(unsigned cudadev)
			
 
				-{
			
 
				-	return latency_matrix[STARPU_MAIN_RAM][cudadev+1];
			
 
				-}
			
 
				-
			
 
				-double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
			
 
				-{
			
 
				-	return bandwidth_matrix[1][STARPU_MAIN_RAM];
			
 
				-}
			
 
				-
			
 
				-double starpu_get_latency_CUDA_RAM(unsigned cudadev)
			
 
				-{
			
 
				-	return latency_matrix[1][STARPU_MAIN_RAM];
			
 
				-}
			
 
				+double starpu_bus_get_bandwidth(unsigned srcnode, unsigned dstnode);
			
 
				+double starpu_bus_get_latency(unsigned srcnode, unsigned dstnode);
			
 
				 
			
 
				 void starpu_bus_print_bandwidth(FILE *f)
			
 
				 {
			
@@ -1877,19 +1860,19 @@ void _starpu_load_bus_performance_files(void)
 
				 }
			
 
				 
			
 
				 /* (in MB/s) */
			
 
				-double _starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
			
 
				+double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
			
 
				 {
			
 
				 	return bandwidth_matrix[src_node][dst_node];
			
 
				 }
			
 
				 
			
 
				 /* (in µs) */
			
 
				-double _starpu_transfer_latency(unsigned src_node, unsigned dst_node)
			
 
				+double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
			
 
				 {
			
 
				 	return latency_matrix[src_node][dst_node];
			
 
				 }
			
 
				 
			
 
				 /* (in µs) */
			
 
				-double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
			
 
				+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
			
 
				 {
			
 
				 	double bandwidth = bandwidth_matrix[src_node][dst_node];
			
 
				 	double latency = latency_matrix[src_node][dst_node];
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -62,7 +62,7 @@ unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destinati
 
				 		{
			
 
				 			if (src_node_mask & (1<<i))
			
 
				 			{
			
 
				-				double time = _starpu_predict_transfer_time(i, destination, size);
			
 
				+				double time = starpu_transfer_predict(i, destination, size);
			
 
				 				unsigned handling_node;
			
 
				 
			
 
				 				/* Avoid indirect transfers */
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -1043,7 +1043,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 
				 			{
			
 
				 				/* only time can change between disk <-> main_ram 
			
 
				 				 * and not between main_ram <-> worker if we compare disks */
			
 
				-				double time_tmp = _starpu_predict_transfer_time(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
			
 
				+				double time_tmp = starpu_transfer_predict(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
			
 
				 				if (target == -1 || time_disk > time_tmp)
			
 
				 				{
			
 
				 					target = i;
			
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -585,7 +585,7 @@ void starpu_bound_print_lp(FILE *output)
 
				 							/* The data transfer from w to w2 only happens if tasks run there */
			
 
				 							fprintf(output, "d_t%luw%ut%luw%u >= %f - 2e5 + 1e5 t%luw%u + 1e5 t%luw%u;\n",
			
 
				 									t1->deps[i].dep->id, w, t1->id, w2,
			
 
				-									_starpu_predict_transfer_time(n, n2, t1->deps[i].size)/1000.,
			
 
				+									starpu_transfer_predict(n, n2, t1->deps[i].size)/1000.,
			
 
				 									t1->deps[i].dep->id, w, t1->id, w2);
			
 
				 						}
			
 
				 					}