13 years ago · 6b446a9ab8
--- a/doc/chapters/configuration.texi
+++ b/doc/chapters/configuration.texi
@@ -342,6 +342,10 @@ The AMD implementation of OpenCL is known to
 
																 fail when copying data asynchronously. When using this implementation,
															
 
																 it is therefore necessary to disable asynchronous data transfers.
															
 
																+@item @code{STARPU_DISABLE_CUDA_GPU_GPU_DIRECT}
															
 
																+Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
															
 
																+instead. This makes it possible to test the performance effect of GPU-Direct.
															
 
																+
															
 
																 @end table
															
 
																 @node Scheduling
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -191,11 +191,13 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
																 	/* Initialize CUDA context on the source */
															
 
																 	cudaSetDevice(src);
															
 
																-	cures = cudaDeviceCanAccessPeer(&can, src, dst);
															
 
																-	if (!cures && can) {
															
 
																-		cures = cudaDeviceEnablePeerAccess(dst, 0);
															
 
																-		if (!cures)
															
 
																-			_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
															
 
																+	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") > 0) {
															
 
																+		cures = cudaDeviceCanAccessPeer(&can, src, dst);
															
 
																+		if (!cures && can) {
															
 
																+			cures = cudaDeviceEnablePeerAccess(dst, 0);
															
 
																+			if (!cures)
															
 
																+				_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
															
 
																+		}
															
 
																 	}
															
 
																 	/* Allocate a buffer on the device */
															
@@ -207,11 +209,13 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
																 	/* Initialize CUDA context on the destination */
															
 
																 	cudaSetDevice(dst);
															
 
																-	cures = cudaDeviceCanAccessPeer(&can, dst, src);
															
 
																-	if (!cures && can) {
															
 
																-		cures = cudaDeviceEnablePeerAccess(src, 0);
															
 
																-		if (!cures)
															
 
																-			_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
															
 
																+	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") > 0) {
															
 
																+		cures = cudaDeviceCanAccessPeer(&can, dst, src);
															
 
																+		if (!cures && can) {
															
 
																+			cures = cudaDeviceEnablePeerAccess(src, 0);
															
 
																+			if (!cures)
															
 
																+				_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
															
 
																+		}
															
 
																 	}
															
 
																 	/* Allocate a buffer on the device */
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -156,16 +156,18 @@ static void init_context(int devid)
 
																 	starpu_cuda_set_device(devid);
															
 
																 #ifdef HAVE_CUDA_MEMCPY_PEER
															
 
																-	int nworkers = starpu_worker_get_count();
															
 
																-	for (workerid = 0; workerid < nworkers; workerid++) {
															
 
																-		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
															
 
																-		if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid) {
															
 
																-			int can;
															
 
																-			cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
															
 
																-			if (!cures && can) {
															
 
																-				cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
															
 
																-				if (!cures)
															
 
																-					_STARPU_DEBUG("GPU-Direct %d -> %d\n", worker->devid, devid);
															
 
																+	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") > 0) {
															
 
																+		int nworkers = starpu_worker_get_count();
															
 
																+		for (workerid = 0; workerid < nworkers; workerid++) {
															
 
																+			struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
															
 
																+			if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid) {
															
 
																+				int can;
															
 
																+				cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
															
 
																+				if (!cures && can) {
															
 
																+					cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
															
 
																+					if (!cures)
															
 
																+						_STARPU_DEBUG("GPU-Direct %d -> %d\n", worker->devid, devid);
															
 
																+				}
															
 
																 			}
															
 
																 		}
															
 
																 	}