|
@@ -86,6 +86,41 @@ static hwloc_topology_t hwtopology;
|
|
|
#endif
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
+
|
|
|
+static void initialize_cuda(unsigned ncuda)
|
|
|
+{
|
|
|
+ unsigned dev;
|
|
|
+
|
|
|
+ for (dev = 0; dev < ncuda; dev++) {
|
|
|
+ cudaError_t cures;
|
|
|
+
|
|
|
+ _STARPU_DISP("Initializing CUDA%d...\n", dev);
|
|
|
+
|
|
|
+ /* Initiliaze CUDA context on the device */
|
|
|
+ cures = cudaSetDevice(dev);
|
|
|
+ if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
+
|
|
|
+#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
|
+ unsigned dev2;
|
|
|
+ for (dev2 = 0; dev2 < ncuda; dev2++) {
|
|
|
+ if (dev2 != dev) {
|
|
|
+ int can;
|
|
|
+ cures = cudaDeviceCanAccessPeer(&can, dev, dev2);
|
|
|
+ if (!cures && can) {
|
|
|
+ cures = cudaDeviceEnablePeerAccess(dev2, 0);
|
|
|
+ if (!cures)
|
|
|
+ _STARPU_DISP("GPU-Direct %d -> %d\n", dev2, dev);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+#endif
|
|
|
+
|
|
|
+ /* hack to force the initialization */
|
|
|
+ cures = cudaFree(0);
|
|
|
+ if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
|
|
|
{
|
|
|
struct _starpu_machine_config *config = _starpu_get_machine_config();
|
|
@@ -98,12 +133,6 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
|
|
|
/* hack to avoid third party libs to rebind threads */
|
|
|
_starpu_bind_thread_on_cpu(config, cpu);
|
|
|
|
|
|
- /* hack to force the initialization */
|
|
|
- cudaFree(0);
|
|
|
-
|
|
|
- /* hack to avoid third party libs to rebind threads */
|
|
|
- _starpu_bind_thread_on_cpu(config, cpu);
|
|
|
-
|
|
|
/* Get the maximum size which can be allocated on the device */
|
|
|
struct cudaDeviceProp prop;
|
|
|
cudaError_t cures;
|
|
@@ -537,12 +566,7 @@ static void benchmark_all_gpu_devices(void)
|
|
|
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
ncuda = _starpu_get_cuda_device_count();
|
|
|
- for (i = 0; i < ncuda; i++)
|
|
|
- {
|
|
|
- _STARPU_DISP("CUDA %d...\n", i);
|
|
|
- /* measure bandwidth between Host and Device i */
|
|
|
- measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, "CUDA");
|
|
|
- }
|
|
|
+ initialize_cuda(ncuda);
|
|
|
#ifdef HAVE_CUDA_MEMCPY_PEER
|
|
|
for (i = 0; i < ncuda; i++)
|
|
|
for (j = 0; j < ncuda; j++)
|
|
@@ -553,6 +577,12 @@ static void benchmark_all_gpu_devices(void)
|
|
|
measure_bandwidth_between_dev_and_dev_cuda(i, j);
|
|
|
}
|
|
|
#endif
|
|
|
+ for (i = 0; i < ncuda; i++)
|
|
|
+ {
|
|
|
+ _STARPU_DISP("CUDA %d...\n", i);
|
|
|
+ /* measure bandwidth between Host and Device i */
|
|
|
+ measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, "CUDA");
|
|
|
+ }
|
|
|
#endif
|
|
|
#ifdef STARPU_USE_OPENCL
|
|
|
nopencl = _starpu_opencl_get_device_count();
|