浏览代码

Replace starpu_get_{bandwidth,latency}_{RAM_CUDA,CUDA_RAM} with calls to starpu_transfer_*

Samuel Thibault 12 年之前
父节点
当前提交
9c038ac2be

+ 6 - 10
doc/doxygen/chapters/api/performance_model.doxy

@@ -260,20 +260,16 @@ existing set of measurements done in good conditions, that StarPU
 could benefit from instead of doing on-line measurements. An example
 of use can be seen in \ref PerformanceModelExample.
 
-\fn double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
+\fn double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
+Return the bandwidth (in MB/s) of data transfers between two memory nodes.
 
-\fn double starpu_get_latency_RAM_CUDA(unsigned cudadev)
+\fn double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
+Return the latency (in µs) of data transfers between two memory nodes.
 
-\fn double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
-\ingroup API_Performance_Mode
-Used to compute the execution time of tasks
-
-\fn double starpu_get_latency_CUDA_RAM(unsigned cudadev)
+\fn double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
 \ingroup API_Performance_Model
-Used to compute the execution time of tasks
+Return the estimated time (in µs) to transfer data of the given size between two memory nodes.
 
 */

+ 3 - 5
include/starpu_perfmodel.h

@@ -169,11 +169,9 @@ void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct star
 void starpu_bus_print_bandwidth(FILE *f);
 void starpu_bus_print_affinity(FILE *f);
 
-double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev);
-double starpu_get_latency_RAM_CUDA(unsigned cudadev);
-double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev);
-double starpu_get_latency_CUDA_RAM(unsigned cudadev);
-
+double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
+double starpu_transfer_latency(unsigned src_node, unsigned dst_node);
+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size);
 
 #ifdef __cplusplus
 }

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/ispeed_lp_policy.c

@@ -284,7 +284,7 @@ static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_i
 					unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx);
 					if(!worker_in_ctx)
 					{
-						double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker) / 1000;
+						double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)) / 1000;
 						speed[s][w] = (speed[s][w] * transfer_speed) / (speed[s][w] + transfer_speed);
 					}
 				}

+ 4 - 4
sc_hypervisor/src/policies_utils/policy_tools.c

@@ -430,9 +430,9 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 				{
 					if(arch == STARPU_CUDA_WORKER)
 					{
-						double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker);
+						double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker));
 						transfer_time +=  (tp->data_size / transfer_speed) / 1000. ;
-						double latency = starpu_get_latency_RAM_CUDA(worker);
+						double latency = starpu_transfer_latency(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker));
 						transfer_time += latency/1000.;
 						
 						
@@ -441,9 +441,9 @@ void sc_hypervisor_get_tasks_times(int nw, int nt, double times[nw][nt], int *wo
 					{
 						if(!starpu_sched_ctx_contains_type_of_worker(arch, tp->sched_ctx_id))
 						{
-							double transfer_speed = starpu_get_bandwidth_CUDA_RAM(worker);
+							double transfer_speed = starpu_transfer_bandwidth(starpu_worker_get_memory_node(worker), STARPU_MAIN_RAM);
 							transfer_time += (tp->data_size / transfer_speed) / 1000. ;
-							double latency = starpu_get_latency_CUDA_RAM(worker);
+							double latency = starpu_transfer_latency(starpu_worker_get_memory_node(worker), STARPU_MAIN_RAM);
 							transfer_time += latency / 1000.;
 						}
 					}

+ 2 - 2
sc_hypervisor/src/policies_utils/speed.c

@@ -74,10 +74,10 @@ double sc_hypervisor_get_speed_per_worker(struct sc_hypervisor_wrapper *sc_w, un
 /* /\* 			if(!worker_in_ctx) *\/ */
 /* /\* 			{ *\/ */
 
-/* /\* 				double transfer_speed = starpu_get_bandwidth_RAM_CUDA(worker); *\/ */
+/* /\* 				double transfer_speed = starpu_transfer_bandwidth(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)); *\/ */
 /* /\* 				elapsed_time +=  (elapsed_data_used / transfer_speed) / 1000000 ; *\/ */
 /* /\* 			} *\/ */
-/* 			double latency = starpu_get_latency_RAM_CUDA(worker); */
+/* 			double latency = starpu_transfer_latency(STARPU_MAIN_RAM, starpu_worker_get_memory_node(worker)); */
 /* //			printf("%d/%d: latency %lf elapsed_time before %lf ntasks %d\n", worker, sc_w->sched_ctx, latency, elapsed_time, elapsed_tasks); */
 /* 			elapsed_time += (elapsed_tasks * latency)/1000000; */
 /* //			printf("elapsed time after %lf \n", elapsed_time); */

+ 1 - 1
src/core/perfmodel/perfmodel.c

@@ -274,7 +274,7 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 		return 0.0;
 
 	unsigned src_node = _starpu_select_src_node(handle, memory_node);
-	return _starpu_predict_transfer_time(src_node, memory_node, size);
+	return starpu_transfer_predict(src_node, memory_node, size);
 }
 
 /* Data transfer performance modeling */

+ 1 - 5
src/core/perfmodel/perfmodel.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -58,10 +58,6 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 void _starpu_create_sampling_directory_if_needed(void);
 
 void _starpu_load_bus_performance_files(void);
-double _starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node);
-double _starpu_transfer_latency(unsigned src_node, unsigned dst_node);
-double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size);
-
 
 void _starpu_set_calibrate_flag(unsigned val);
 unsigned _starpu_get_calibrate_flag(void);

+ 5 - 22
src/core/perfmodel/perfmodel_bus.c

@@ -1379,25 +1379,8 @@ static void write_bus_bandwidth_file_content(void)
 }
 #endif /* STARPU_SIMGRID */
 
-double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
-{
-	return bandwidth_matrix[STARPU_MAIN_RAM][cudadev+1];
-}
-
-double starpu_get_latency_RAM_CUDA(unsigned cudadev)
-{
-	return latency_matrix[STARPU_MAIN_RAM][cudadev+1];
-}
-
-double starpu_get_bandwidth_CUDA_RAM(unsigned cudadev)
-{
-	return bandwidth_matrix[1][STARPU_MAIN_RAM];
-}
-
-double starpu_get_latency_CUDA_RAM(unsigned cudadev)
-{
-	return latency_matrix[1][STARPU_MAIN_RAM];
-}
+double starpu_bus_get_bandwidth(unsigned srcnode, unsigned dstnode);
+double starpu_bus_get_latency(unsigned srcnode, unsigned dstnode);
 
 void starpu_bus_print_bandwidth(FILE *f)
 {
@@ -1877,19 +1860,19 @@ void _starpu_load_bus_performance_files(void)
 }
 
 /* (in MB/s) */
-double _starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
+double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
 {
 	return bandwidth_matrix[src_node][dst_node];
 }
 
 /* (in µs) */
-double _starpu_transfer_latency(unsigned src_node, unsigned dst_node)
+double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
 {
 	return latency_matrix[src_node][dst_node];
 }
 
 /* (in µs) */
-double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
+double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
 {
 	double bandwidth = bandwidth_matrix[src_node][dst_node];
 	double latency = latency_matrix[src_node][dst_node];

+ 1 - 1
src/datawizard/coherency.c

@@ -62,7 +62,7 @@ unsigned _starpu_select_src_node(starpu_data_handle_t handle, unsigned destinati
 		{
 			if (src_node_mask & (1<<i))
 			{
-				double time = _starpu_predict_transfer_time(i, destination, size);
+				double time = starpu_transfer_predict(i, destination, size);
 				unsigned handling_node;
 
 				/* Avoid indirect transfers */

+ 1 - 1
src/datawizard/memalloc.c

@@ -1043,7 +1043,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 			{
 				/* only time can change between disk <-> main_ram 
 				 * and not between main_ram <-> worker if we compare disks */
-				double time_tmp = _starpu_predict_transfer_time(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
+				double time_tmp = starpu_transfer_predict(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
 				if (target == -1 || time_disk > time_tmp)
 				{
 					target = i;

+ 1 - 1
src/profiling/bound.c

@@ -585,7 +585,7 @@ void starpu_bound_print_lp(FILE *output)
 							/* The data transfer from w to w2 only happens if tasks run there */
 							fprintf(output, "d_t%luw%ut%luw%u >= %f - 2e5 + 1e5 t%luw%u + 1e5 t%luw%u;\n",
 									t1->deps[i].dep->id, w, t1->id, w2,
-									_starpu_predict_transfer_time(n, n2, t1->deps[i].size)/1000.,
+									starpu_transfer_predict(n, n2, t1->deps[i].size)/1000.,
 									t1->deps[i].dep->id, w, t1->id, w2);
 						}
 					}