|
@@ -386,7 +386,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
|
|
|
|
|
|
double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
|
|
|
|
|
|
- _STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
|
|
|
+ _STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
|
|
|
}
|
|
|
|
|
|
unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
|
|
@@ -429,15 +429,12 @@ static void benchmark_all_gpu_devices(void)
|
|
|
#endif
|
|
|
|
|
|
struct starpu_machine_config_s *config = _starpu_get_machine_config();
|
|
|
- ncpus = config->topology.ncpus;
|
|
|
-
|
|
|
- /* TODO: measure bandwidth between GPU-GPU */
|
|
|
+ ncpus = _starpu_topology_get_nhwcpu(config);
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- ncuda = _starpu_get_cuda_device_count();
|
|
|
+ cudaGetDeviceCount(&ncuda);
|
|
|
for (i = 0; i < ncuda; i++)
|
|
|
{
|
|
|
- fprintf(stderr," CUDA %d...", i);
|
|
|
/* measure bandwidth between Host and Device i */
|
|
|
measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
|
|
|
}
|
|
@@ -446,7 +443,6 @@ static void benchmark_all_gpu_devices(void)
|
|
|
nopencl = _starpu_opencl_get_device_count();
|
|
|
for (i = 0; i < nopencl; i++)
|
|
|
{
|
|
|
- fprintf(stderr," OpenCL %d...", i);
|
|
|
/* measure bandwith between Host and Device i */
|
|
|
measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
|
|
|
}
|
|
@@ -481,7 +477,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
|
|
|
char hostname[32];
|
|
|
char *forced_hostname = getenv("STARPU_HOSTNAME");
|
|
|
if (forced_hostname && forced_hostname[0])
|
|
|
- snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
|
|
|
+ snprintf(hostname, sizeof(hostname), forced_hostname);
|
|
|
else
|
|
|
gethostname(hostname, sizeof(hostname));
|
|
|
strncat(path, ".", maxlen);
|
|
@@ -509,11 +505,11 @@ static void load_bus_affinity_file_content(void)
|
|
|
|
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
|
struct starpu_machine_config_s *config = _starpu_get_machine_config();
|
|
|
- ncpus = config->topology.ncpus;
|
|
|
+ ncpus = _starpu_topology_get_nhwcpu(config);
|
|
|
int gpu;
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- ncuda = _starpu_get_cuda_device_count();
|
|
|
+ cudaGetDeviceCount(&ncuda);
|
|
|
for (gpu = 0; gpu < ncuda; gpu++)
|
|
|
{
|
|
|
int ret;
|
|
@@ -764,7 +760,7 @@ static void write_bus_latency_file_content(void)
|
|
|
latency = ((src && dst)?2000.0:500.0);
|
|
|
}
|
|
|
|
|
|
- fprintf(f, "%f\t", latency);
|
|
|
+ fprintf(f, "%lf\t", latency);
|
|
|
}
|
|
|
|
|
|
fprintf(f, "\n");
|
|
@@ -892,21 +888,23 @@ static void write_bus_bandwidth_file_content(void)
|
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
|
else if (src != dst)
|
|
|
{
|
|
|
- double slowness_src_to_ram=0.0, slowness_ram_to_dst=0.0;
|
|
|
- /* Total bandwidth is the harmonic mean of bandwidths */
|
|
|
+ double time_src_to_ram=0.0, time_ram_to_dst=0.0;
|
|
|
+ double timing;
|
|
|
+ /* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- if (src && src <= ncuda)
|
|
|
- slowness_src_to_ram = cudadev_timing_dtoh[src]/cuda_size;
|
|
|
- if (dst && dst <= ncuda)
|
|
|
- slowness_ram_to_dst = cudadev_timing_htod[dst]/cuda_size;
|
|
|
+ time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
|
|
|
+ time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
|
|
|
+ timing =time_src_to_ram + time_ram_to_dst;
|
|
|
+ bandwidth = 1.0*cuda_size/timing;
|
|
|
#endif
|
|
|
#ifdef STARPU_USE_OPENCL
|
|
|
- if (src > ncuda)
|
|
|
- slowness_src_to_ram = opencldev_timing_dtoh[src-ncuda]/opencl_size;
|
|
|
- if (dst > ncuda)
|
|
|
- slowness_ram_to_dst = opencldev_timing_htod[dst-ncuda]/opencl_size;
|
|
|
+ if (src > ncuda)
|
|
|
+ time_src_to_ram = (src==0)?0.0:opencldev_timing_dtoh[src-ncuda];
|
|
|
+ if (dst > ncuda)
|
|
|
+ time_ram_to_dst = (dst==0)?0.0:opencldev_timing_htod[dst-ncuda];
|
|
|
+ timing =time_src_to_ram + time_ram_to_dst;
|
|
|
+ bandwidth = 1.0*opencl_size/timing;
|
|
|
#endif
|
|
|
- bandwidth = 1.0/(slowness_src_to_ram + slowness_ram_to_dst);
|
|
|
}
|
|
|
#endif
|
|
|
else {
|
|
@@ -914,7 +912,7 @@ static void write_bus_bandwidth_file_content(void)
|
|
|
bandwidth = 0.0;
|
|
|
}
|
|
|
|
|
|
- fprintf(f, "%f\t", bandwidth);
|
|
|
+ fprintf(f, "%lf\t", bandwidth);
|
|
|
}
|
|
|
|
|
|
fprintf(f, "\n");
|
|
@@ -923,38 +921,6 @@ static void write_bus_bandwidth_file_content(void)
|
|
|
fclose(f);
|
|
|
}
|
|
|
|
|
|
-void starpu_print_bus_bandwidth(FILE *f)
|
|
|
-{
|
|
|
- int src, dst, maxnode;
|
|
|
-
|
|
|
- maxnode = ncuda;
|
|
|
-#ifdef STARPU_USE_OPENCL
|
|
|
- maxnode += nopencl;
|
|
|
-#endif
|
|
|
-
|
|
|
- fprintf(f, "from\t");
|
|
|
- fprintf(f, "to RAM\t\t");
|
|
|
- for (dst = 0; dst < ncuda; dst++)
|
|
|
- fprintf(f, "to CUDA %d\t", dst);
|
|
|
- for (dst = 0; dst < nopencl; dst++)
|
|
|
- fprintf(f, "to OpenCL %d\t", dst);
|
|
|
- fprintf(f, "\n");
|
|
|
-
|
|
|
- for (src = 0; src <= maxnode; src++)
|
|
|
- {
|
|
|
- if (!src)
|
|
|
- fprintf(f, "RAM\t");
|
|
|
- else if (src <= ncuda)
|
|
|
- fprintf(f, "CUDA %d\t", src-1);
|
|
|
- else
|
|
|
- fprintf(f, "OpenCL%d\t", src-ncuda-1);
|
|
|
- for (dst = 0; dst <= maxnode; dst++)
|
|
|
- fprintf(f, "%f\t", bandwidth_matrix[src][dst]);
|
|
|
-
|
|
|
- fprintf(f, "\n");
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
static void generate_bus_bandwidth_file(void)
|
|
|
{
|
|
|
if (!was_benchmarked)
|
|
@@ -1020,9 +986,9 @@ static void check_bus_config_file()
|
|
|
fclose(f);
|
|
|
|
|
|
// Loading current configuration
|
|
|
- ncpus = config->topology.ncpus;
|
|
|
+ ncpus = _starpu_topology_get_nhwcpu(config);
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
- ncuda = _starpu_get_cuda_device_count();
|
|
|
+ cudaGetDeviceCount(&ncuda);
|
|
|
#endif
|
|
|
#ifdef STARPU_USE_OPENCL
|
|
|
nopencl = _starpu_opencl_get_device_count();
|