лет назад: 13 · 6b446a9ab8
--- a/doc/chapters/configuration.texi
+++ b/doc/chapters/configuration.texi
@@ -342,6 +342,10 @@ The AMD implementation of OpenCL is known to
 
				 fail when copying data asynchronously. When using this implementation,
			
 
				 it is therefore necessary to disable asynchronous data transfers.
			
 
				 
			
 
				+@item @code{STARPU_DISABLE_CUDA_GPU_GPU_DIRECT}
			
 
				+Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
			
 
				+instead. This makes it possible to measure the performance effect of GPU-Direct.
			
 
				+
			
 
				 @end table
			
 
				 
			
 
				 @node Scheduling
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -191,11 +191,13 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	/* Initialize CUDA context on the source */
			
 
				 	cudaSetDevice(src);
			
 
				 
			
 
				-	cures = cudaDeviceCanAccessPeer(&can, src, dst);
			
 
				-	if (!cures && can) {
			
 
				-		cures = cudaDeviceEnablePeerAccess(dst, 0);
			
 
				-		if (!cures)
			
 
				-			_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
			
 
				+	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) { /* enable peer access unless the variable is set to a positive value */
			
 
				+		cures = cudaDeviceCanAccessPeer(&can, src, dst);
			
 
				+		if (!cures && can) {
			
 
				+			cures = cudaDeviceEnablePeerAccess(dst, 0);
			
 
				+			if (!cures)
			
 
				+				_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	/* Allocate a buffer on the device */
			
@@ -207,11 +209,13 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	/* Initialize CUDA context on the destination */
			
 
				 	cudaSetDevice(dst);
			
 
				 
			
 
				-	cures = cudaDeviceCanAccessPeer(&can, dst, src);
			
 
				-	if (!cures && can) {
			
 
				-		cures = cudaDeviceEnablePeerAccess(src, 0);
			
 
				-		if (!cures)
			
 
				-			_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
			
 
				+	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) { /* enable peer access unless the variable is set to a positive value */
			
 
				+		cures = cudaDeviceCanAccessPeer(&can, dst, src);
			
 
				+		if (!cures && can) {
			
 
				+			cures = cudaDeviceEnablePeerAccess(src, 0);
			
 
				+			if (!cures)
			
 
				+				_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	/* Allocate a buffer on the device */
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -156,16 +156,18 @@ static void init_context(int devid)
 
				 	starpu_cuda_set_device(devid);
			
 
				 
			
 
				 #ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				-	int nworkers = starpu_worker_get_count();
			
 
				-	for (workerid = 0; workerid < nworkers; workerid++) {
			
 
				-		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			
 
				-		if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid) {
			
 
				-			int can;
			
 
				-			cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
			
 
				-			if (!cures && can) {
			
 
				-				cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
			
 
				-				if (!cures)
			
 
				-					_STARPU_DEBUG("GPU-Direct %d -> %d\n", worker->devid, devid);
			
 
				+	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) { /* enable peer access unless the variable is set to a positive value */
			
 
				+		int nworkers = starpu_worker_get_count();
			
 
				+		for (workerid = 0; workerid < nworkers; workerid++) {
			
 
				+			struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			
 
				+			if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid) {
			
 
				+				int can;
			
 
				+				cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
			
 
				+				if (!cures && can) {
			
 
				+					cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
			
 
				+					if (!cures)
			
 
				+						_STARPU_DEBUG("GPU-Direct %d -> %d\n", worker->devid, devid);
			
 
				+				}
			
 
				 			}
			
 
				 		}
			
 
				 	}