|
@@ -64,19 +64,19 @@ static int nopencl = 0;
|
|
|
/* Benchmarking the performance of the bus */
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
-static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
|
|
|
+static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXCPUS];
|
|
|
static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
|
|
|
static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
|
|
|
#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
|
static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
|
|
|
#endif
|
|
|
-static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
|
|
|
+static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
|
|
|
#endif
|
|
|
#ifdef STARPU_USE_OPENCL
|
|
|
-static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
|
|
|
+static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXCPUS];
|
|
|
static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
|
|
|
static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
|
|
|
-static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
|
|
|
+static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
|
|
|
#endif
|
|
|
|
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
|
|
@@ -150,7 +150,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
|
|
|
gettimeofday(&end, NULL);
|
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
|
|
|
|
|
|
/* Measure download bandwidth */
|
|
|
gettimeofday(&start, NULL);
|
|
@@ -162,7 +162,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
|
|
|
gettimeofday(&end, NULL);
|
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
|
|
|
|
|
|
/* Free buffers */
|
|
|
cudaFreeHost(h_buffer);
|
|
@@ -323,7 +323,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
|
|
|
gettimeofday(&end, NULL);
|
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
|
|
|
|
|
|
/* Measure download bandwidth */
|
|
|
gettimeofday(&start, NULL);
|
|
@@ -335,7 +335,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
|
|
|
gettimeofday(&end, NULL);
|
|
|
timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
|
|
|
|
|
|
/* Free buffers */
|
|
|
clReleaseMemObject(d_buffer);
|
|
@@ -423,7 +423,7 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
|
|
|
unsigned cpu;
|
|
|
for (cpu = 0; cpu < ncpus; cpu++)
|
|
|
{
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id = cpu;
|
|
|
|
|
|
#ifdef STARPU_HAVE_HWLOC
|
|
|
int numa_id = 0;
|
|
@@ -437,9 +437,9 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
|
|
|
if (is_available_per_numa_node[numa_id])
|
|
|
{
|
|
|
/* We reuse the previous numbers for that NUMA node */
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod =
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod =
|
|
|
dev_timing_htod_per_numa_node[numa_id];
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh =
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh =
|
|
|
dev_timing_dtoh_per_numa_node[numa_id];
|
|
|
continue;
|
|
|
}
|
|
@@ -460,9 +460,9 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
|
|
|
{
|
|
|
/* Save the results for that NUMA node */
|
|
|
dev_timing_htod_per_numa_node[numa_id] =
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
|
|
|
dev_timing_dtoh_per_numa_node[numa_id] =
|
|
|
- dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
|
|
|
+ dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
|
|
|
|
|
|
is_available_per_numa_node[numa_id] = 1;
|
|
|
}
|
|
@@ -485,7 +485,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
|
|
|
measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);
|
|
|
|
|
|
/* sort the results */
|
|
|
- qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
|
|
|
+ qsort(&(dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS]), ncpus,
|
|
|
sizeof(struct dev_timing),
|
|
|
compar_dev_timing);
|
|
|
|
|
@@ -493,24 +493,24 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
|
|
|
unsigned cpu;
|
|
|
for (cpu = 0; cpu < ncpus; cpu++)
|
|
|
{
|
|
|
- unsigned current_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id;
|
|
|
- double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
|
|
|
- double bandwidth_htod = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
|
|
|
+ unsigned current_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id;
|
|
|
+ double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
|
|
|
+ double bandwidth_htod = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
|
|
|
|
|
|
double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
|
|
|
|
|
|
_STARPU_DISP("(%10s) BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", type, dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
|
|
|
}
|
|
|
|
|
|
- unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
|
|
|
+ unsigned best_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].cpu_id;
|
|
|
|
|
|
_STARPU_DISP("(%10s) BANDWIDTH GPU %d BEST CPU %u\n", type, dev, best_cpu);
|
|
|
#endif
|
|
|
|
|
|
/* The results are sorted in a decreasing order, so that the best
|
|
|
* measurement is currently the first entry. */
|
|
|
- dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_dtoh;
|
|
|
- dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_htod;
|
|
|
+ dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_dtoh;
|
|
|
+ dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_htod;
|
|
|
}
|
|
|
#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
|
|
|
|
|
@@ -727,7 +727,7 @@ static void write_bus_affinity_file_content(void)
|
|
|
|
|
|
for (cpu = 0; cpu < ncpus; cpu++)
|
|
|
{
|
|
|
- fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
|
|
|
+ fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
|
|
|
}
|
|
|
|
|
|
fprintf(f, "\n");
|
|
@@ -740,7 +740,7 @@ static void write_bus_affinity_file_content(void)
|
|
|
|
|
|
for (cpu = 0; cpu < ncpus; cpu++)
|
|
|
{
|
|
|
- fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
|
|
|
+ fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
|
|
|
}
|
|
|
|
|
|
fprintf(f, "\n");
|
|
@@ -1108,12 +1108,12 @@ void starpu_bus_print_bandwidth(FILE *f)
|
|
|
maxnode += nopencl;
|
|
|
#endif
|
|
|
|
|
|
- fprintf(f, "from\t");
|
|
|
- fprintf(f, "to RAM\t");
|
|
|
+ fprintf(f, "from to\t");
|
|
|
+ fprintf(f, "RAM\t");
|
|
|
for (dst = 0; dst < ncuda; dst++)
|
|
|
- fprintf(f, "to CUDA %d\t", dst);
|
|
|
+ fprintf(f, "CUDA %d\t", dst);
|
|
|
for (dst = 0; dst < nopencl; dst++)
|
|
|
- fprintf(f, "to OpenCL %d\t", dst);
|
|
|
+ fprintf(f, "OpenCL %d\t", dst);
|
|
|
fprintf(f, "\n");
|
|
|
|
|
|
for (src = 0; src <= maxnode; src++)
|
|
@@ -1143,7 +1143,7 @@ void starpu_bus_print_bandwidth(FILE *f)
|
|
|
fprintf(f, "CUDA %d\t", src-1);
|
|
|
for (cpu = 0; cpu < ncpus; cpu++)
|
|
|
{
|
|
|
- timing = &cudadev_timing_per_cpu[src*MAXCPUS+cpu];
|
|
|
+ timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
|
|
|
if (timing->timing_htod)
|
|
|
fprintf(f, "%d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
|
|
|
else
|
|
@@ -1155,7 +1155,7 @@ void starpu_bus_print_bandwidth(FILE *f)
|
|
|
fprintf(f, "OpenCL%d\t", src-ncuda-1);
|
|
|
for (cpu = 0; cpu < ncpus; cpu++)
|
|
|
{
|
|
|
- timing = &opencldev_timing_per_cpu[(src-ncuda)*MAXCPUS+cpu];
|
|
|
+ timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
|
|
|
if (timing->timing_htod)
|
|
|
fprintf(f, "%d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
|
|
|
else
|