
Start adding support for CUDA 4:
- GPU-GPU transfers are enabled when applicable (not implemented in all data
interfaces yet!).
- A non-CUDA worker can initiate a transfer/allocation on a CUDA device.

Still TODO:
- Implement the cuda_to_cuda_async method for all interfaces (especially matrix)
- Benchmark the actual GPU-GPU transfers for the performance models
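
For context, a minimal sketch (not part of the commit) of the CUDA 4 peer-copy calls everything below builds on; the async-then-blocking fallback mirrors the strategy used by the data interfaces:

	#include <cuda_runtime.h>
	#include <errno.h>

	/* Copy len bytes from device src_dev to device dst_dev. Returns 0 on
	 * synchronous completion, -EAGAIN if the copy was enqueued on stream. */
	static int peer_copy_sketch(void *dst, int dst_dev, const void *src,
				    int src_dev, size_t len, cudaStream_t stream)
	{
		cudaError_t cures;

		cures = cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, len, stream);
		if (cures == cudaSuccess)
			return -EAGAIN;

		/* the asynchronous copy could not be enqueued: block instead */
		cures = cudaMemcpyPeer(dst, dst_dev, src, src_dev, len);
		return (cures == cudaSuccess) ? 0 : -1;
	}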

Cédric Augonnet · 14 years ago · commit e51515a4dd

+ 12 - 0
configure.ac

@@ -473,6 +473,18 @@ if test x$have_curand = xyes; then
     STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcurand"
 fi
 
+# Peer transfers are only supported since CUDA 4.0
+have_cuda_memcpy_peer=no
+if test x$enable_cuda = xyes; then
+    SAVED_LDFLAGS="${LDFLAGS}"
+    LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
+    AC_CHECK_FUNC([cudaMemcpyPeer], have_cuda_memcpy_peer=yes, have_cuda_memcpy_peer=no)
+    LDFLAGS="${SAVED_LDFLAGS}"
+fi
+if test x$have_cuda_memcpy_peer = xyes; then
+    AC_DEFINE(HAVE_CUDA_MEMCPY_PEER,[],[Peer transfers are supported in CUDA])
+fi
+
 if test x$enable_cuda = xyes; then
     STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lstdc++"
     AC_SUBST(STARPU_CUDA_LDFLAGS)
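
Elsewhere the new symbol serves as the feature guard; a minimal sketch of the intended pattern (the real guards below usually also test STARPU_USE_CUDA):

	#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
	/* CUDA >= 4.0: direct GPU-GPU peer copies are available, and any
	 * worker may cudaSetDevice() onto a device before touching it. */
	#else
	/* older CUDA: GPU-GPU transfers are staged through main memory */
	#endif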

+ 52 - 11
src/datawizard/coherency.c

@@ -107,19 +107,58 @@ void _starpu_update_data_state(starpu_data_handle handle,
 	}
 }
 
+static int worker_supports_direct_access(unsigned node)
+{
+	int type = _starpu_get_node_kind(node);
+	switch (type)
+	{
+		case STARPU_CUDA_RAM:
+#ifdef HAVE_CUDA_MEMCPY_PEER
+			/* GPUs do not always allow direct remote access: if CUDA 4
+			 * is enabled, we allow two CUDA devices to communicate. */
+			return 1;
+#else
+			/* Direct GPU-GPU transfers are not allowed in general */
+			return 0;
+#endif
+		case STARPU_OPENCL_RAM:
+			return 0;
+		default:
+			return 1;
+	}
+}
+
+static int link_supports_direct_transfers(starpu_data_handle handle, unsigned src_node, unsigned dst_node)
+{
+	/* NB: when OpenCL and CUDA support peer transfers, we'll need to apply
+	 * a little more checking here! */
+
+	/* XXX That's a hack until we get cudaMemcpy3DPeerAsync to work!
+	 * Perhaps not all data interfaces provide a direct GPU-GPU transfer
+	 * method! */
+	if (src_node != dst_node && _starpu_get_node_kind(src_node) == STARPU_CUDA_RAM && _starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+	{
+		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+		return (!!copy_methods->cuda_to_cuda_async);
+	}
+
+	return (worker_supports_direct_access(src_node)
+			&& worker_supports_direct_access(dst_node));
+}
+
 /* Determines the path of a request: each hop is defined by (src,dst) and the
  * node that handles the hop. The returned value indicates the number of hops,
  * and max_len is the maximum number of hops (i.e. the size of the src_nodes,
  * dst_nodes and handling_nodes arrays). */
-static int determine_request_path(unsigned src_node, unsigned dst_node,
+static int determine_request_path(starpu_data_handle handle,
+				unsigned src_node, unsigned dst_node,
 				starpu_access_mode mode, int max_len,
 				unsigned *src_nodes, unsigned *dst_nodes,
 				unsigned *handling_nodes)
 {
-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) == STARPU_OPENCL_RAM);
+	int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node);
 
-	if ((mode & STARPU_R) && (src_is_a_gpu && dst_is_a_gpu)) {
+	if (!link_is_valid && (mode & STARPU_R)) {
 		/* We need an intermediate hop to implement data staging
 		 * through main memory. */
 		STARPU_ASSERT(max_len >= 2);
@@ -139,15 +178,17 @@ static int determine_request_path(unsigned src_node, unsigned dst_node,
 		return 2;
 	}
 	else {
+		unsigned handling_node;
+		int src_supports_peer;
+
 		STARPU_ASSERT(max_len >= 1);
 		
-		unsigned handling_node;
+		/* If we do have to perform a transfer (mode & STARPU_R), but
+		 * the source does not support peer access, then we need to
+		 * rely on the source to perform the transfer. */
+		src_supports_peer = worker_supports_direct_access(src_node);
+		handling_node = ((mode & STARPU_R) && !src_supports_peer)?src_node:dst_node;
 
-		/* The handling node is the GPU (if applicable), otherwise it's
-		 * the destination node. If both src and dst are GPUs, we
-		 * ensured that !(mode & STARPU_R), so we only need to allocate
-		 * the data from the destination */
-		handling_node = (!src_is_a_gpu && dst_is_a_gpu)?dst_node:src_node;
 		src_nodes[0] = src_node;
 		dst_nodes[0] = dst_node;
 		handling_nodes[0] = handling_node;
@@ -251,7 +292,7 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	/* We can safely assume that there won't be more than 2 hops in the
 	 * current implementation */
 	unsigned src_nodes[4], dst_nodes[4], handling_nodes[4];
-	int nhops = determine_request_path(src_node, requesting_node, mode, 4,
+	int nhops = determine_request_path(handle, src_node, requesting_node, mode, 4,
 					src_nodes, dst_nodes, handling_nodes);
 	STARPU_ASSERT(nhops <= 4);
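
To make the routing concrete: when no direct link exists and a read is required, the request is split into two hops staged through main memory. A hedged illustration of what determine_request_path() then fills in (node 0 standing for STARPU_CPU_RAM; the handling-node choice follows worker_supports_direct_access() as above):

	/* hop 0: GPU src -> RAM, driven by the source node's worker */
	src_nodes[0] = src_node;  dst_nodes[0] = 0;  handling_nodes[0] = src_node;
	/* hop 1: RAM -> GPU dst, driven by the destination node's worker */
	src_nodes[1] = 0;  dst_nodes[1] = dst_node;  handling_nodes[1] = dst_node;
	return 2;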
 

+ 62 - 38
src/datawizard/copy_driver.c

@@ -108,6 +108,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
+	{
+		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
+		cures = cudaSetDevice(starpu_memory_node_to_devid(node));
+		STARPU_ASSERT(cures == cudaSuccess);
+	}
+#endif
+
 	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind)) {
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
@@ -116,29 +125,22 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 		break;
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
-		/* CUBLAS_RAM -> STARPU_CPU_RAM */
-		/* only the proper CUBLAS thread can initiate this ! */
-		if (_starpu_get_local_memory_node() == src_node) {
-			/* only the proper CUBLAS thread can initiate this directly ! */
-			STARPU_ASSERT(copy_methods->cuda_to_ram);
-			if (!req || !copy_methods->cuda_to_ram_async) {
-				/* this is not associated to a request so it's synchronous */
-				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
-			}
-			else {
-				cures = cudaEventCreate(&req->async_channel.cuda_event);
-				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-
-				stream = starpu_cuda_get_local_transfer_stream();
-				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
-
-				cures = cudaEventRecord(req->async_channel.cuda_event, stream);
-				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-			}
+		/* only the proper CUDA thread can initiate this directly! */
+		STARPU_ASSERT(copy_methods->cuda_to_ram);
+		if (!req || !copy_methods->cuda_to_ram_async) {
+			/* this is not associated with a request, so it's synchronous */
+			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			/* we should not have a blocking call ! */
-			STARPU_ABORT();
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+			stream = starpu_cuda_get_local_transfer_stream();
+			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 		}
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -151,13 +153,35 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			cures = cudaEventCreate(&req->async_channel.cuda_event);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_stream();
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
-			cures = cudaEventRecord(req->async_channel.cuda_event, stream);
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+			if (STARPU_UNLIKELY(cures != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
+		}
+		break;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
+		/* CUDA - CUDA transfer */
+		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
+		if (!req || !copy_methods->cuda_to_cuda_async) {
+			/* this is not associated with a request, so it's synchronous */
+			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+		}
+		else {
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+			stream = starpu_cuda_get_local_stream();
+			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 		}
 		break;
@@ -172,7 +196,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
 			}
 			else {
-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.opencl_event));
+				req->async_channel.type = STARPU_OPENCL_RAM;
+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 			}
 		}
 		else {
@@ -189,7 +214,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.opencl_event));
+			req->async_channel.type = STARPU_OPENCL_RAM;
+			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 		}
 		break;
 #endif
@@ -263,10 +289,9 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
-void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel __attribute__ ((unused)),
-					unsigned handling_node)
+void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = _starpu_get_node_kind(handling_node);
+	starpu_node_kind kind = async_channel->type;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 	cudaError_t cures;
@@ -275,7 +300,7 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 	switch (kind) {
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			event = (*async_channel).cuda_event;
+			event = (*async_channel).event.cuda_event;
 
 			cures = cudaEventSynchronize(event);
 			if (STARPU_UNLIKELY(cures))
@@ -290,10 +315,10 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 #ifdef STARPU_USE_OPENCL
       case STARPU_OPENCL_RAM:
          {
-                 if ((*async_channel).opencl_event == NULL) STARPU_ABORT();
-                 cl_int err = clWaitForEvents(1, &((*async_channel).opencl_event));
+                 if ((*async_channel).event.opencl_event == NULL) STARPU_ABORT();
+                 cl_int err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
                  if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-                 clReleaseEvent((*async_channel).opencl_event);
+                 clReleaseEvent((*async_channel).event.opencl_event);
          }
          break;
 #endif
@@ -303,10 +328,9 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 	}
 }
 
-unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_channel __attribute__ ((unused)),
-					unsigned handling_node)
+unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = _starpu_get_node_kind(handling_node);
+	starpu_node_kind kind = async_channel->type;
 	unsigned success;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
@@ -315,7 +339,7 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 	switch (kind) {
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			event = (*async_channel).cuda_event;
+			event = (*async_channel).event.cuda_event;
 
 			success = (cudaEventQuery(event) == cudaSuccess);
 			if (success)
@@ -327,7 +351,7 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
       case STARPU_OPENCL_RAM:
          {
             cl_int event_status;
-            cl_event opencl_event = (*async_channel).opencl_event;
+            cl_event opencl_event = (*async_channel).event.opencl_event;
             if (opencl_event == NULL) STARPU_ABORT();
             cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
             if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
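
A note on the convention visible throughout this file: the *_async copy methods return 0 when the copy completed synchronously and -EAGAIN when it was only enqueued on the stream. A hedged sketch of the caller side (error checks elided, names as in the code above):

	ret = copy_methods->cuda_to_cuda_async(src_interface, src_node,
					dst_interface, dst_node, stream);
	cudaEventRecord(req->async_channel.event.cuda_event, stream);
	/* ret == -EAGAIN: completion is detected later through
	 * _starpu_driver_test_request_completion(&req->async_channel);
	 * ret == 0: the interface fell back to a blocking copy and the
	 * event fires as soon as the stream reaches it. */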

+ 8 - 3
src/datawizard/copy_driver.h

@@ -46,7 +46,12 @@ typedef union {
 #ifdef STARPU_USE_OPENCL
         cl_event opencl_event;
 #endif
-} starpu_async_channel;
+} starpu_async_channel_event;
+
+struct starpu_async_channel {
+	starpu_async_channel_event event;
+	starpu_node_kind type;
+};
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
 
@@ -57,6 +62,6 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle handle,
 					struct starpu_data_request_s *req,
 					unsigned may_alloc);
 
-unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_channel, unsigned handling_node);
-void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel, unsigned handling_node);
+unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel);
+void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel);
 #endif // __COPY_DRIVER_H__
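
Because the channel now records its own node kind, completion functions no longer need the handling node. A minimal usage sketch under the new layout (error checks elided):

	struct starpu_async_channel chan;

	chan.type = STARPU_CUDA_RAM;
	cudaEventCreate(&chan.event.cuda_event);
	cudaEventRecord(chan.event.cuda_event, stream);

	/* dispatches on chan.type instead of on a handling node */
	_starpu_driver_wait_request_completion(&chan);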

+ 2 - 2
src/datawizard/data_request.c

@@ -412,11 +412,11 @@ static void _handle_pending_node_data_requests(uint32_t src_node, unsigned force
 		/* wait until the transfer is terminated */
 		if (force)
 		{
-			_starpu_driver_wait_request_completion(&r->async_channel, src_node);
+			_starpu_driver_wait_request_completion(&r->async_channel);
 			starpu_handle_data_request_completion(r);
 		}
 		else {
-			if (_starpu_driver_test_request_completion(&r->async_channel, src_node))
+			if (_starpu_driver_test_request_completion(&r->async_channel))
 			{
 				/* The request was completed */
 				starpu_handle_data_request_completion(r);

+ 1 - 1
src/datawizard/data_request.h

@@ -44,7 +44,7 @@ LIST_TYPE(starpu_data_request,
 
 	starpu_access_mode mode;
 
-	starpu_async_channel async_channel;
+	struct starpu_async_channel async_channel;
 
 	unsigned completed;
 	int retval;

+ 199 - 10
src/datawizard/interfaces/csr_interface.c

@@ -28,15 +28,18 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #endif
 
 static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
@@ -46,6 +49,9 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -408,20 +414,203 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind, cudaStream_t stream)
+{
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cudaError_t cures;
+
+	int synchronous_fallback = 0;
+
+	cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
+	if (cures)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyAsync((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind, stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyAsync((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind, stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	
+	if (synchronous_fallback)
+	{
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		return 0;
+	}
+	else {
+		return -EAGAIN;
+	}
+}
+
+static int copy_cuda_peer(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	cudaError_t cures;
+
+	cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+#else
+	STARPU_ABORT();
+	return 0;
+#endif
+}
+
+static int copy_cuda_peer_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cudaError_t cures;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	int synchronous_fallback = 0;
+
+	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
+	if (cures)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	
+	if (synchronous_fallback)
+	{
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		return 0;
+	}
+	else {
+		return -EAGAIN;
+	}
+#else
+	/* Illegal without peer transfers */
+	STARPU_ABORT();
+	return 0;
+#endif
+}
+
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	if (src_node == dst_node)
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	else
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node);
+}
+
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
+}
+
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
+}
+
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+		return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
+	else
+		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
 }
+
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
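
The per-buffer fallback above (nzval, then colind, then rowptr) is deliberately sticky: once one piece has been copied synchronously, every later piece is too, so the caller sees either a fully enqueued transfer (-EAGAIN) or a fully completed one (0), never a mix. Distilled to a single buffer, the pattern is:

	if (!synchronous_fallback)
		cures = cudaMemcpyAsync(dst, src, size, kind, stream);
	if (synchronous_fallback || cures != cudaSuccess)
	{
		/* stay synchronous for this piece and all later ones */
		synchronous_fallback = 1;
		cures = cudaMemcpy(dst, src, size, kind);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}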

+ 133 - 55
src/datawizard/interfaces/matrix_interface.c

@@ -32,6 +32,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+//static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -49,6 +50,7 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+//	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -350,17 +352,48 @@ static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind, int is_async, cudaStream_t stream)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
-
 	cudaError_t cures;
-	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
+
+#if 0
+
+	struct cudaMemcpy3DParms p;
+	memset(&p, 0, sizeof(p));
+
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * src_matrix->ny *elemsize, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * src_matrix->ny *elemsize, dst_matrix->ny);
+	p.extent = make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+	p.kind = kind;
+
+	if (is_async)
+	{
+		cures = cudaMemcpy3DAsync(&p, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpy3D(&p);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+#endif
+
+	if (is_async)
+	{
+		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-			src_matrix->nx*elemsize, src_matrix->ny, kind);
+			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
+		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
+		src_matrix->nx*elemsize, src_matrix->ny, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -369,81 +402,126 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+/* XXX this is broken: we need to find a way to fix it! */
+#if 0
+static int copy_cuda_peer(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), int is_async, cudaStream_t stream)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
+	cudaError_t cures;
 
-	cudaError_t cures;	
-	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
-			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
-			cudaMemcpyDeviceToHost, stream);
-	if (cures)
-	{
-		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
-			(size_t)src_matrix->nx*elemsize, (size_t)src_matrix->ny,
-			cudaMemcpyDeviceToHost);
+#if 1
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
 
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
+	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
-		return 0;
+	cures = cudaSetDevice(src_dev);
+	STARPU_ASSERT(cures == cudaSuccess);
+
+	struct cudaPitchedPtr mem_device1;
+	cures = cudaMalloc3D(&mem_device1, extent);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaSetDevice(dst_dev);
+	STARPU_ASSERT(cures == cudaSuccess);
+
+	struct cudaPitchedPtr mem_device2;
+	cures = cudaMalloc3D(&mem_device2, extent);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	struct cudaMemcpy3DPeerParms p;
+	memset(&p, 0, sizeof(p));
+	p.srcDevice = src_dev;
+	p.dstDevice = dst_dev;
+	p.srcPtr = mem_device1;
+	p.dstPtr = mem_device2;
+	p.extent = extent;
+
+	cures = cudaMemcpy3DPeer(&p);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+
+//make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
+//make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, src_matrix->nx, dst_matrix->ny);
+//make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+
+//	if (is_async)
+//	{
+//		cures = cudaMemcpy3DPeerAsync(&p, stream);
+//		if (!cures)
+//			return -EAGAIN;
+//	}
+
+#else
+	/* XXX FIXME !!*/
+	STARPU_ASSERT(src_matrix->nx == src_matrix->ld);
+	STARPU_ASSERT(dst_matrix->nx == dst_matrix->ld);
+
+	if (is_async)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
+		if (!cures)
+			return -EAGAIN;
 	}
 
+	cures = cudaMemcpyPeer((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+#endif
+
 	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
-	return -EAGAIN;
+	return 0;
 }
+#endif
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 0, 0);
+}
 
-	size_t elemsize = src_matrix->elemsize;
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 0, 0);
+}
 
-	cudaError_t cures;
-	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-				src_matrix->nx*elemsize, src_matrix->ny,
-				cudaMemcpyHostToDevice, stream);
-	if (cures)
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	if (src_node == dst_node)
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
+	else
 	{
-		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-				src_matrix->nx*elemsize, src_matrix->ny, cudaMemcpyHostToDevice);
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
+		/* XXX not implemented */
+		STARPU_ABORT();
 		return 0;
 	}
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 1, stream);
+}
 
-	return -EAGAIN;
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
 }
 
+#if 0
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+{
+	if (src_node == dst_node)
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 1, stream);
+	else
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
+}
+#endif
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
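
For the matrix TODO, a heavily hedged sketch of what the pitched peer copy might look like once cudaMemcpy3DPeer behaves (the parameter mapping is an untested assumption, which is exactly what the #if 0 block above is probing; for linear memory the extent width is in bytes):

	struct cudaMemcpy3DPeerParms p;
	memset(&p, 0, sizeof(p));
	p.srcDevice = src_dev;
	p.dstDevice = dst_dev;
	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr,
			src_matrix->ld * elemsize, src_matrix->nx * elemsize, src_matrix->ny);
	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr,
			dst_matrix->ld * elemsize, src_matrix->nx * elemsize, dst_matrix->ny);
	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
	cures = cudaMemcpy3DPeer(&p);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);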

+ 69 - 2
src/datawizard/interfaces/variable_interface.c

@@ -31,6 +31,7 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_in
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
@@ -47,9 +48,10 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -300,7 +302,31 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
 				void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		int src_dev = starpu_memory_node_to_devid(src_node);
+		int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+		starpu_variable_interface_t *src_variable = src_interface;
+		starpu_variable_interface_t *dst_variable = dst_interface;
+
+		cudaError_t cures;
+		cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, src_variable->elemsize);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+#else
+		/* This is illegal without support for cudaMemcpyPeer */
+		STARPU_ABORT();
+#endif
+		return 0;
+	}
 }
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
@@ -340,6 +366,47 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
+
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		int src_dev = starpu_memory_node_to_devid(src_node);
+		int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+		starpu_variable_interface_t *src_variable = src_interface;
+		starpu_variable_interface_t *dst_variable = dst_interface;
+
+		size_t length = src_variable->elemsize;
+
+		cudaError_t cures;
+		cures = cudaMemcpyPeerAsync((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length, stream);
+		if (cures)
+		{
+			/* synchronous fallback */
+			cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length);
+			if (STARPU_UNLIKELY(cures))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			return 0;
+		}
+
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+
+		return -EAGAIN;
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
+}
+
+
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL

+ 73 - 2
src/datawizard/interfaces/vector_interface.c

@@ -32,6 +32,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
@@ -50,6 +51,7 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -282,6 +284,10 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 {
 	starpu_vector_interface_t *vector_interface = data_interface;
 
+#ifdef STARPU_USE_CUDA
+	cudaError_t cures;
+#endif
+
 	starpu_node_kind kind = _starpu_get_node_kind(node);
 	switch(kind) {
 		case STARPU_CPU_RAM:
@@ -289,7 +295,8 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)vector_interface->ptr);
+			cures = cudaFree((void*)vector_interface->ptr);
+			STARPU_ASSERT(cures == cudaSuccess);
 			break;
 #endif
 #ifdef STARPU_USE_OPENCL
@@ -310,6 +317,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
+
 	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
@@ -319,6 +327,40 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
+#ifdef HAVE_CUDA_MEMCPY_PEER
+static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
+				void *dst_interface, unsigned dst_node,
+				int is_async, cudaStream_t stream)
+{
+	cudaError_t cures;
+
+	starpu_vector_interface_t *src_vector = src_interface;
+	starpu_vector_interface_t *dst_vector = dst_interface;
+
+	size_t length = src_vector->nx*src_vector->elemsize;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	if (is_async)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_vector->ptr, dst_dev,
+						(char *)src_vector->ptr, src_dev,
+						length, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpyPeer((char *)dst_vector->ptr, dst_dev,
+				(char *)src_vector->ptr, src_dev, length);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+
+	return 0;
+}
+#endif
 
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
 				void *dst_interface, unsigned dst_node __attribute__((unused)))
@@ -335,7 +377,19 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
 				void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 0, 0);
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
 }
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
@@ -346,6 +400,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
+
 	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
 	if (cures)
 	{
@@ -362,6 +417,22 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 	return -EAGAIN;
 }
 
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 1, stream);
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
+}
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)

+ 2 - 0
src/datawizard/interfaces/void_interface.c

@@ -39,8 +39,10 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = dummy_copy,
 	.cuda_to_ram = dummy_copy,
+	.cuda_to_cuda = dummy_copy,
 	.ram_to_cuda_async = dummy_cuda_copy_async,
 	.cuda_to_ram_async = dummy_cuda_copy_async,
+	.cuda_to_cuda_async = dummy_cuda_copy_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = dummy_copy,

+ 25 - 0
src/datawizard/memalloc.c

@@ -210,6 +210,18 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 		if (handle && !data_was_deleted)
 			STARPU_ASSERT(replicate->allocated);
 
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+		if (_starpu_get_node_kind(node) == STARPU_CUDA_RAM)
+		{
+			/* To facilitate the design of interface, we set the
+			 * proper CUDA device in case it is needed. This avoids
+			 * having to set it again in the free method of each
+			 * interface. */
+			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(node));
+			STARPU_ASSERT(err == cudaSuccess);
+		}
+#endif
+
 		mc->ops->free_data_on_node(mc->chunk_interface, node);
 
 		if (handle && !data_was_deleted)
@@ -674,6 +686,19 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct star
 
 		STARPU_TRACE_START_ALLOC(dst_node);
 		STARPU_ASSERT(replicate->data_interface);
+
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+		if (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+		{
+			/* To facilitate the design of interface, we set the
+			 * proper CUDA device in case it is needed. This avoids
+			 * having to set it again in the malloc method of each
+			 * interface. */
+			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(dst_node));
+			STARPU_ASSERT(err == cudaSuccess);
+		}
+#endif
+
 		allocated_memory = handle->ops->allocate_data_on_node(replicate->data_interface, dst_node);
 		STARPU_TRACE_END_ALLOC(dst_node);
 

+ 7 - 1
src/drivers/cuda/driver_cuda.c

@@ -158,6 +158,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 {
 	int ret;
 	uint32_t mask = 0;
+	cudaError_t cures;
 
 	STARPU_ASSERT(j);
 	struct starpu_task *task = j->task;
@@ -184,7 +185,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
 	if (calibrate_model)
 	{
-		cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
+		cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
@@ -195,6 +196,11 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 	int profiling = starpu_profiling_status_get();
 	profiling_info = task->profiling_info;
 
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	/* Make sure we manipulate the proper device */
+	cures = cudaSetDevice(args->devid);
+#endif
+
 	if ((profiling && profiling_info) || calibrate_model)
 	{
 		starpu_clock_gettime(&codelet_start);