
Start adding support for CUDA 4:
- GPU-GPU transfers are enabled when applicable (not implemented in all data
interfaces yet!).
- A non-CUDA worker can initiate a transfer/allocation on a CUDA device.

Still TODO:
- Implement the cuda_to_cuda_async method for all interfaces (especially matrix)
- Benchmark the actual GPU-GPU transfers for the performance models
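
For context, a minimal sketch (not part of the commit) of the CUDA 4 peer-copy calls everything below builds on; the async-then-blocking fallback mirrors the strategy used by the data interfaces:

	#include <cuda_runtime.h>
	#include <errno.h>

	/* Copy len bytes from device src_dev to device dst_dev. Returns 0 on
	 * synchronous completion, -EAGAIN if the copy was enqueued on stream. */
	static int peer_copy_sketch(void *dst, int dst_dev, const void *src,
				    int src_dev, size_t len, cudaStream_t stream)
	{
		cudaError_t cures;

		cures = cudaMemcpyPeerAsync(dst, dst_dev, src, src_dev, len, stream);
		if (cures == cudaSuccess)
			return -EAGAIN;

		/* the asynchronous copy could not be enqueued: block instead */
		cures = cudaMemcpyPeer(dst, dst_dev, src, src_dev, len);
		return (cures == cudaSuccess) ? 0 : -1;
	}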

Cédric Augonnet · 14 years ago · commit e51515a4dd

+ 12 - 0
configure.ac

@@ -473,6 +473,18 @@ if test x$have_curand = xyes; then
     STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcurand"
 fi
 
+# Peer transfers are only supported since CUDA 4.0
+have_cuda_memcpy_peer=no
+if test x$enable_cuda = xyes; then
+    SAVED_LDFLAGS="${LDFLAGS}"
+    LDFLAGS="${LDFLAGS} ${STARPU_CUDA_LDFLAGS}"
+    AC_CHECK_FUNC([cudaMemcpyPeer], have_cuda_memcpy_peer=yes, have_cuda_memcpy_peer=no)
+    LDFLAGS="${SAVED_LDFLAGS}"
+fi
+if test x$have_cuda_memcpy_peer = xyes; then
+    AC_DEFINE(HAVE_CUDA_MEMCPY_PEER,[],[Peer transfers are supported in CUDA])
+fi
+
 if test x$enable_cuda = xyes; then
     STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lstdc++"
     AC_SUBST(STARPU_CUDA_LDFLAGS)
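
Elsewhere the new symbol serves as the feature guard; a minimal sketch of the intended pattern (the real guards below usually also test STARPU_USE_CUDA):

	#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
	/* CUDA >= 4.0: direct GPU-GPU peer copies are available, and any
	 * worker may cudaSetDevice() onto a device before touching it. */
	#else
	/* older CUDA: GPU-GPU transfers are staged through main memory */
	#endif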

+ 52 - 11
src/datawizard/coherency.c

@@ -107,19 +107,58 @@ void _starpu_update_data_state(starpu_data_handle handle,
 	}
 }
 
+static int worker_supports_direct_access(unsigned node)
+{
+	int type = _starpu_get_node_kind(node);
+	switch (type)
+	{
+		case STARPU_CUDA_RAM:
+#ifdef HAVE_CUDA_MEMCPY_PEER
+			/* GPUs do not always allow direct remote access: if CUDA 4
+			 * is enabled, we allow two CUDA devices to communicate. */
+			return 1;
+#else
+			/* Direct GPU-GPU transfers are not allowed in general */
+			return 0;
+#endif
+		case STARPU_OPENCL_RAM:
+			return 0;
+		default:
+			return 1;
+	}
+}
+
+static int link_supports_direct_transfers(starpu_data_handle handle, unsigned src_node, unsigned dst_node)
+{
+	/* NB: when OpenCL and CUDA support peer transfers, we'll need to apply
+	 * a little more checking here! */
+
+	/* XXX That's a hack until we get cudaMemcpy3DPeerAsync to work!
+	 * Perhaps not all data interfaces provide a direct GPU-GPU transfer
+	 * method! */
+	if (src_node != dst_node && _starpu_get_node_kind(src_node) == STARPU_CUDA_RAM && _starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+	{
+		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+		return (!!copy_methods->cuda_to_cuda_async);
+	}
+
+	return (worker_supports_direct_access(src_node)
+			&& worker_supports_direct_access(dst_node));
+}
+
 /* Determines the path of a request: each hop is defined by (src,dst) and the
  * node that handles the hop. The returned value indicates the number of hops,
  * and max_len is the maximum number of hops (i.e. the size of the src_nodes,
  * dst_nodes and handling_nodes arrays). */
-static int determine_request_path(unsigned src_node, unsigned dst_node,
+static int determine_request_path(starpu_data_handle handle,
+				unsigned src_node, unsigned dst_node,
 				starpu_access_mode mode, int max_len,
 				unsigned *src_nodes, unsigned *dst_nodes,
 				unsigned *handling_nodes)
 {
-	unsigned src_is_a_gpu = (_starpu_get_node_kind(src_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(src_node) == STARPU_OPENCL_RAM);
-	unsigned dst_is_a_gpu = (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM || _starpu_get_node_kind(dst_node) == STARPU_OPENCL_RAM);
+	int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node);
 
-	if ((mode & STARPU_R) && (src_is_a_gpu && dst_is_a_gpu)) {
+	if (!link_is_valid && (mode & STARPU_R)) {
 		/* We need an intermediate hop to implement data staging
 		 * through main memory. */
 		STARPU_ASSERT(max_len >= 2);
@@ -139,15 +178,17 @@ static int determine_request_path(unsigned src_node, unsigned dst_node,
 		return 2;
 	}
 	else {
+		unsigned handling_node;
+		int src_supports_peer;
+
 		STARPU_ASSERT(max_len >= 1);
 		
-		unsigned handling_node;
+		/* If we do have to perform a transfer (mode & STARPU_R), but
+		 * the source does not support peer access, then we need to
+		 * rely on the source to perform the transfer. */
+		src_supports_peer = worker_supports_direct_access(src_node);
+		handling_node = ((mode & STARPU_R) && !src_supports_peer)?src_node:dst_node;
 
-		/* The handling node is the GPU (if applicable), otherwise it's
-		 * the destination node. If both src and dst are GPUs, we
-		 * ensured that !(mode & STARPU_R), so we only need to allocate
-		 * the data from the destination */
-		handling_node = (!src_is_a_gpu && dst_is_a_gpu)?dst_node:src_node;
 		src_nodes[0] = src_node;
 		dst_nodes[0] = dst_node;
 		handling_nodes[0] = handling_node;
@@ -251,7 +292,7 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	/* We can safely assume that there won't be more than 2 hops in the
 	 * current implementation */
 	unsigned src_nodes[4], dst_nodes[4], handling_nodes[4];
-	int nhops = determine_request_path(src_node, requesting_node, mode, 4,
+	int nhops = determine_request_path(handle, src_node, requesting_node, mode, 4,
 					src_nodes, dst_nodes, handling_nodes);
 	STARPU_ASSERT(nhops <= 4);
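
To make the routing concrete: when no direct link exists and a read is required, the request is split into two hops staged through main memory. A hedged illustration of what determine_request_path() then fills in (node 0 standing for STARPU_CPU_RAM; the handling-node choice follows worker_supports_direct_access() as above):

	/* hop 0: GPU src -> RAM, driven by the source node's worker */
	src_nodes[0] = src_node;  dst_nodes[0] = 0;  handling_nodes[0] = src_node;
	/* hop 1: RAM -> GPU dst, driven by the destination node's worker */
	src_nodes[1] = 0;  dst_nodes[1] = dst_node;  handling_nodes[1] = dst_node;
	return 2;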
 

+ 62 - 38
src/datawizard/copy_driver.c

@@ -108,6 +108,15 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
+	{
+		int node = (dst_kind == STARPU_CUDA_RAM)?dst_node:src_node;
+		cures = cudaSetDevice(starpu_memory_node_to_devid(node));
+		STARPU_ASSERT(cures == cudaSuccess);
+	}
+#endif
+
 	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind)) {
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
@@ -116,29 +125,22 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 		break;
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
-		/* CUBLAS_RAM -> STARPU_CPU_RAM */
-		/* only the proper CUBLAS thread can initiate this ! */
-		if (_starpu_get_local_memory_node() == src_node) {
-			/* only the proper CUBLAS thread can initiate this directly ! */
-			STARPU_ASSERT(copy_methods->cuda_to_ram);
-			if (!req || !copy_methods->cuda_to_ram_async) {
-				/* this is not associated to a request so it's synchronous */
-				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
-			}
-			else {
-				cures = cudaEventCreate(&req->async_channel.cuda_event);
-				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-
-				stream = starpu_cuda_get_local_transfer_stream();
-				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
-
-				cures = cudaEventRecord(req->async_channel.cuda_event, stream);
-				if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-			}
+		/* only the proper CUDA thread can initiate this directly! */
+		STARPU_ASSERT(copy_methods->cuda_to_ram);
+		if (!req || !copy_methods->cuda_to_ram_async) {
+			/* this is not associated with a request, so it's synchronous */
+			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			/* we should not have a blocking call ! */
-			STARPU_ABORT();
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+			stream = starpu_cuda_get_local_transfer_stream();
+			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 		}
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
@@ -151,13 +153,35 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			cures = cudaEventCreate(&req->async_channel.cuda_event);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_stream();
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
-			cures = cudaEventRecord(req->async_channel.cuda_event, stream);
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+			if (STARPU_UNLIKELY(cures != cudaSuccess))
+				STARPU_CUDA_REPORT_ERROR(cures);
+		}
+		break;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
+		/* CUDA - CUDA transfer */
+		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
+		if (!req || !copy_methods->cuda_to_cuda_async) {
+			/* this is not associated with a request, so it's synchronous */
+			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+		}
+		else {
+			req->async_channel.type = STARPU_CUDA_RAM;
+			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
+			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+			stream = starpu_cuda_get_local_stream();
+			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+
+			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 		}
 		break;
@@ -172,7 +196,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
 			}
 			else {
-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.opencl_event));
+				req->async_channel.type = STARPU_OPENCL_RAM;
+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 			}
 		}
 		else {
@@ -189,7 +214,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle handle, struct starpu_dat
 			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
 		}
 		else {
-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.opencl_event));
+			req->async_channel.type = STARPU_OPENCL_RAM;
+			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
 		}
 		break;
 #endif
@@ -263,10 +289,9 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
-void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel __attribute__ ((unused)),
-					unsigned handling_node)
+void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = _starpu_get_node_kind(handling_node);
+	starpu_node_kind kind = async_channel->type;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
 	cudaError_t cures;
@@ -275,7 +300,7 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 	switch (kind) {
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			event = (*async_channel).cuda_event;
+			event = (*async_channel).event.cuda_event;
 
 			cures = cudaEventSynchronize(event);
 			if (STARPU_UNLIKELY(cures))
@@ -290,10 +315,10 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 #ifdef STARPU_USE_OPENCL
       case STARPU_OPENCL_RAM:
          {
-                 if ((*async_channel).opencl_event == NULL) STARPU_ABORT();
-                 cl_int err = clWaitForEvents(1, &((*async_channel).opencl_event));
+                 if ((*async_channel).event.opencl_event == NULL) STARPU_ABORT();
+                 cl_int err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
                  if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-                 clReleaseEvent((*async_channel).opencl_event);
+                 clReleaseEvent((*async_channel).event.opencl_event);
          }
          break;
 #endif
@@ -303,10 +328,9 @@ void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel
 	}
 }
 
-unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_channel __attribute__ ((unused)),
-					unsigned handling_node)
+unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel)
 {
-	starpu_node_kind kind = _starpu_get_node_kind(handling_node);
+	starpu_node_kind kind = async_channel->type;
 	unsigned success;
 #ifdef STARPU_USE_CUDA
 	cudaEvent_t event;
@@ -315,7 +339,7 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
 	switch (kind) {
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			event = (*async_channel).cuda_event;
+			event = (*async_channel).event.cuda_event;
 
 			success = (cudaEventQuery(event) == cudaSuccess);
 			if (success)
@@ -327,7 +351,7 @@ unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_chan
       case STARPU_OPENCL_RAM:
          {
             cl_int event_status;
-            cl_event opencl_event = (*async_channel).opencl_event;
+            cl_event opencl_event = (*async_channel).event.opencl_event;
             if (opencl_event == NULL) STARPU_ABORT();
             cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
             if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
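
A note on the convention visible throughout this file: the *_async copy methods return 0 when the copy completed synchronously and -EAGAIN when it was only enqueued on the stream. A hedged sketch of the caller side (error checks elided, names as in the code above):

	ret = copy_methods->cuda_to_cuda_async(src_interface, src_node,
					dst_interface, dst_node, stream);
	cudaEventRecord(req->async_channel.event.cuda_event, stream);
	/* ret == -EAGAIN: completion is detected later through
	 * _starpu_driver_test_request_completion(&req->async_channel);
	 * ret == 0: the interface fell back to a blocking copy and the
	 * event fires as soon as the stream reaches it. */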

+ 8 - 3
src/datawizard/copy_driver.h

@@ -46,7 +46,12 @@ typedef union {
 #ifdef STARPU_USE_OPENCL
         cl_event opencl_event;
 #endif
-} starpu_async_channel;
+} starpu_async_channel_event;
+
+struct starpu_async_channel {
+	starpu_async_channel_event event;
+	starpu_node_kind type;
+};
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid);
 
@@ -57,6 +62,6 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle handle,
 					struct starpu_data_request_s *req,
 					unsigned may_alloc);
 
-unsigned _starpu_driver_test_request_completion(starpu_async_channel *async_channel, unsigned handling_node);
-void _starpu_driver_wait_request_completion(starpu_async_channel *async_channel, unsigned handling_node);
+unsigned _starpu_driver_test_request_completion(struct starpu_async_channel *async_channel);
+void _starpu_driver_wait_request_completion(struct starpu_async_channel *async_channel);
 #endif // __COPY_DRIVER_H__
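
Because the channel now records its own node kind, completion functions no longer need the handling node. A minimal usage sketch under the new layout (error checks elided):

	struct starpu_async_channel chan;

	chan.type = STARPU_CUDA_RAM;
	cudaEventCreate(&chan.event.cuda_event);
	cudaEventRecord(chan.event.cuda_event, stream);

	/* dispatches on chan.type instead of on a handling node */
	_starpu_driver_wait_request_completion(&chan);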

+ 2 - 2
src/datawizard/data_request.c

@@ -412,11 +412,11 @@ static void _handle_pending_node_data_requests(uint32_t src_node, unsigned force
 		/* wait until the transfer is terminated */
 		if (force)
 		{
-			_starpu_driver_wait_request_completion(&r->async_channel, src_node);
+			_starpu_driver_wait_request_completion(&r->async_channel);
 			starpu_handle_data_request_completion(r);
 		}
 		else {
-			if (_starpu_driver_test_request_completion(&r->async_channel, src_node))
+			if (_starpu_driver_test_request_completion(&r->async_channel))
 			{
 				/* The request was completed */
 				starpu_handle_data_request_completion(r);

+ 1 - 1
src/datawizard/data_request.h

@@ -44,7 +44,7 @@ LIST_TYPE(starpu_data_request,
 
 	starpu_access_mode mode;
 
-	starpu_async_channel async_channel;
+	struct starpu_async_channel async_channel;
 
 	unsigned completed;
 	int retval;

+ 199 - 10
src/datawizard/interfaces/csr_interface.c

@@ -28,15 +28,18 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 #endif
 
 static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
@@ -46,6 +49,9 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -408,20 +414,203 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind, cudaStream_t stream)
+{
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cudaError_t cures;
+
+	int synchronous_fallback = 0;
+
+	cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
+	if (cures)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyAsync((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind, stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyAsync((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind, stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	
+	if (synchronous_fallback)
+	{
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		return 0;
+	}
+	else {
+		return -EAGAIN;
+	}
+}
+
+static int copy_cuda_peer(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	cudaError_t cures;
+
+	cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
+#else
+	STARPU_ABORT();
+	return 0;
+#endif
+}
+
+static int copy_cuda_peer_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+{
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	starpu_csr_interface_t *src_csr = src_interface;
+	starpu_csr_interface_t *dst_csr = dst_interface;
+
+	uint32_t nnz = src_csr->nnz;
+	uint32_t nrow = src_csr->nrow;
+	size_t elemsize = src_csr->elemsize;
+
+	cudaError_t cures;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	int synchronous_fallback = 0;
+
+	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
+	if (cures)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+
+	if (!synchronous_fallback)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
+	}
+
+	if (synchronous_fallback || cures != cudaSuccess)
+	{
+		synchronous_fallback = 1;
+		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	
+	if (synchronous_fallback)
+	{
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+		return 0;
+	}
+	else {
+		return -EAGAIN;
+	}
+#else
+	/* Illegal without peer transfers */
+	STARPU_ABORT();
+	return 0;
+#endif
+}
+
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
 	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
 }
 
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	if (src_node == dst_node)
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	else
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node);
+}
+
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
+}
+
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
+}
+
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+		return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
+	else
+		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
 }
+
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
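
The per-buffer fallback above (nzval, then colind, then rowptr) is deliberately sticky: once one piece has been copied synchronously, every later piece is too, so the caller sees either a fully enqueued transfer (-EAGAIN) or a fully completed one (0), never a mix. Distilled to a single buffer, the pattern is:

	if (!synchronous_fallback)
		cures = cudaMemcpyAsync(dst, src, size, kind, stream);
	if (synchronous_fallback || cures != cudaSuccess)
	{
		/* stay synchronous for this piece and all later ones */
		synchronous_fallback = 1;
		cures = cudaMemcpy(dst, src, size, kind);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}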

+ 133 - 55
src/datawizard/interfaces/matrix_interface.c

@@ -32,6 +32,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+//static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -49,6 +50,7 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+//	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -350,17 +352,48 @@ static void free_matrix_buffer_on_node(void *data_interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind, int is_async, cudaStream_t stream)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
-
 	cudaError_t cures;
-	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
+
+#if 0
+
+	struct cudaMemcpy3DParms p;
+	memset(&p, 0, sizeof(p));
+
+	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->ld * src_matrix->ny *elemsize, src_matrix->ny);
+	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->ld * src_matrix->ny *elemsize, dst_matrix->ny);
+	p.extent = make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+	p.kind = kind;
+
+	if (is_async)
+	{
+		cures = cudaMemcpy3DAsync(&p, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpy3D(&p);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+#endif
+
+	if (is_async)
+	{
+		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-			src_matrix->nx*elemsize, src_matrix->ny, kind);
+			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
+		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
+		src_matrix->nx*elemsize, src_matrix->ny, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -369,81 +402,126 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+/* XXX this is broken: we need to find a way to fix it! */
+#if 0
+static int copy_cuda_peer(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), int is_async, cudaStream_t stream)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
 
 	size_t elemsize = src_matrix->elemsize;
+	cudaError_t cures;
 
-	cudaError_t cures;	
-	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
-			(size_t)src_matrix->nx*elemsize, src_matrix->ny,
-			cudaMemcpyDeviceToHost, stream);
-	if (cures)
-	{
-		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-			(char *)src_matrix->ptr, (size_t)src_matrix->ld*elemsize,
-			(size_t)src_matrix->nx*elemsize, (size_t)src_matrix->ny,
-			cudaMemcpyDeviceToHost);
+#if 1
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
 
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
+	struct cudaExtent extent = make_cudaExtent(128, 128, 128);
 
-		return 0;
+	cures = cudaSetDevice(src_dev);
+	STARPU_ASSERT(cures == cudaSuccess);
+
+	struct cudaPitchedPtr mem_device1;
+	cures = cudaMalloc3D(&mem_device1, extent);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaSetDevice(dst_dev);
+	STARPU_ASSERT(cures == cudaSuccess);
+
+	struct cudaPitchedPtr mem_device2;
+	cures = cudaMalloc3D(&mem_device2, extent);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	struct cudaMemcpy3DPeerParms p;
+	memset(&p, 0, sizeof(p));
+	p.srcDevice = src_dev;
+	p.dstDevice = dst_dev;
+	p.srcPtr = mem_device1;
+	p.dstPtr = mem_device2;
+	p.extent = extent;
+
+	cures = cudaMemcpy3DPeer(&p);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+
+//make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
+//make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, src_matrix->nx, dst_matrix->ny);
+//make_cudaExtent(src_matrix->nx, src_matrix->ny, 1);
+
+//	if (is_async)
+//	{
+//		cures = cudaMemcpy3DPeerAsync(&p, stream);
+//		if (!cures)
+//			return -EAGAIN;
+//	}
+
+#else
+	/* XXX FIXME !!*/
+	STARPU_ASSERT(src_matrix->nx == src_matrix->ld);
+	STARPU_ASSERT(dst_matrix->nx == dst_matrix->ld);
+
+	if (is_async)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize, stream);
+		if (!cures)
+			return -EAGAIN;
 	}
 
+	cures = cudaMemcpyPeer((char *)dst_matrix->ptr, dst_dev, (char *)src_matrix->ptr, src_dev, dst_matrix->nx*dst_matrix->ny*elemsize);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+#endif
+
 	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
 
-	return -EAGAIN;
+	return 0;
 }
+#endif
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 0, 0);
+}
 
-	size_t elemsize = src_matrix->elemsize;
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 0, 0);
+}
 
-	cudaError_t cures;
-	cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-				src_matrix->nx*elemsize, src_matrix->ny,
-				cudaMemcpyHostToDevice, stream);
-	if (cures)
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	if (src_node == dst_node)
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
+	else
 	{
-		cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-				(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-				src_matrix->nx*elemsize, src_matrix->ny, cudaMemcpyHostToDevice);
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
+		/* XXX not implemented */
+		STARPU_ABORT();
 		return 0;
 	}
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 1, stream);
+}
 
-	return -EAGAIN;
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
 }
 
+#if 0
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
+{
+	if (src_node == dst_node)
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 1, stream);
+	else
+		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
+}
+#endif
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL
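
For the matrix TODO, a heavily hedged sketch of what the pitched peer copy might look like once cudaMemcpy3DPeer behaves (the parameter mapping is an untested assumption, which is exactly what the #if 0 block above is probing; for linear memory the extent width is in bytes):

	struct cudaMemcpy3DPeerParms p;
	memset(&p, 0, sizeof(p));
	p.srcDevice = src_dev;
	p.dstDevice = dst_dev;
	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr,
			src_matrix->ld * elemsize, src_matrix->nx * elemsize, src_matrix->ny);
	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr,
			dst_matrix->ld * elemsize, src_matrix->nx * elemsize, dst_matrix->ny);
	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
	cures = cudaMemcpy3DPeer(&p);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);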

+ 69 - 2
src/datawizard/interfaces/variable_interface.c

@@ -31,6 +31,7 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_in
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
@@ -47,9 +48,10 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -300,7 +302,31 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
 				void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		int src_dev = starpu_memory_node_to_devid(src_node);
+		int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+		starpu_variable_interface_t *src_variable = src_interface;
+		starpu_variable_interface_t *dst_variable = dst_interface;
+
+		cudaError_t cures;
+		cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, src_variable->elemsize);
+		if (STARPU_UNLIKELY(cures))
+			STARPU_CUDA_REPORT_ERROR(cures);
+
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+
+#else
+		/* This is illegal without support for cudaMemcpyPeer */
+		STARPU_ABORT();
+#endif
+		return 0;
+	}
 }
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
@@ -340,6 +366,47 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 {
 	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
+
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		int src_dev = starpu_memory_node_to_devid(src_node);
+		int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+		starpu_variable_interface_t *src_variable = src_interface;
+		starpu_variable_interface_t *dst_variable = dst_interface;
+
+		size_t length = src_variable->elemsize;
+
+		cudaError_t cures;
+		cures = cudaMemcpyPeerAsync((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length, stream);
+		if (cures)
+		{
+			/* synchronous fallback */
+			cures = cudaMemcpyPeer((char *)dst_variable->ptr, dst_dev, (char *)src_variable->ptr, src_dev, length);
+			if (STARPU_UNLIKELY(cures))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			return 0;
+		}
+
+		STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+
+		return -EAGAIN;
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
+}
+
+
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL

+ 73 - 2
src/datawizard/interfaces/vector_interface.c

@@ -32,6 +32,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t stream);
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
@@ -50,6 +51,7 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
 	.cuda_to_cuda = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -282,6 +284,10 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 {
 	starpu_vector_interface_t *vector_interface = data_interface;
 
+#ifdef STARPU_USE_CUDA
+	cudaError_t cures;
+#endif
+
 	starpu_node_kind kind = _starpu_get_node_kind(node);
 	switch(kind) {
 		case STARPU_CPU_RAM:
@@ -289,7 +295,8 @@ static void free_vector_buffer_on_node(void *data_interface, uint32_t node)
 			break;
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
-			cudaFree((void*)vector_interface->ptr);
+			cures = cudaFree((void*)vector_interface->ptr);
+			STARPU_ASSERT(cures == cudaSuccess);
 			break;
 #endif
 #ifdef STARPU_USE_OPENCL
@@ -310,6 +317,7 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
+
 	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
@@ -319,6 +327,40 @@ static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
+#ifdef HAVE_CUDA_MEMCPY_PEER
+static int copy_cuda_peer_common(void *src_interface, unsigned src_node,
+				void *dst_interface, unsigned dst_node,
+				int is_async, cudaStream_t stream)
+{
+	cudaError_t cures;
+
+	starpu_vector_interface_t *src_vector = src_interface;
+	starpu_vector_interface_t *dst_vector = dst_interface;
+
+	size_t length = src_vector->nx*src_vector->elemsize;
+
+	int src_dev = starpu_memory_node_to_devid(src_node);
+	int dst_dev = starpu_memory_node_to_devid(dst_node);
+
+	if (is_async)
+	{
+		cures = cudaMemcpyPeerAsync((char *)dst_vector->ptr, dst_dev,
+						(char *)src_vector->ptr, src_dev,
+						length, stream);
+		if (!cures)
+			return -EAGAIN;
+	}
+
+	cures = cudaMemcpyPeer((char *)dst_vector->ptr, dst_dev,
+				(char *)src_vector->ptr, src_dev, length);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, length);
+
+	return 0;
+}
+#endif
 
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
 				void *dst_interface, unsigned dst_node __attribute__((unused)))
@@ -335,7 +377,19 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
 				void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	if (src_node == dst_node)
+	{
+		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 0, 0);
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
 }
 
 static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
@@ -346,6 +400,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
+
 	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, stream);
 	if (cures)
 	{
@@ -362,6 +417,22 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node __attri
 	return -EAGAIN;
 }
 
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
+{
+	if (src_node == dst_node)
+	{
+		return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
+	}
+	else {
+#ifdef HAVE_CUDA_MEMCPY_PEER
+		return copy_cuda_peer_common(src_interface, src_node, dst_interface, dst_node, 1, stream);
+#else
+		/* This is illegal without cudaMemcpyPeer */
+		STARPU_ABORT();
+		return 0;
+#endif
+	}
+}
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)

+ 2 - 0
src/datawizard/interfaces/void_interface.c

@@ -39,8 +39,10 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = dummy_copy,
 	.cuda_to_ram = dummy_copy,
+	.cuda_to_cuda = dummy_copy,
 	.ram_to_cuda_async = dummy_cuda_copy_async,
 	.cuda_to_ram_async = dummy_cuda_copy_async,
+	.cuda_to_cuda_async = dummy_cuda_copy_async,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = dummy_copy,

+ 25 - 0
src/datawizard/memalloc.c

@@ -210,6 +210,18 @@ static size_t free_memory_on_node(starpu_mem_chunk_t mc, uint32_t node)
 		if (handle && !data_was_deleted)
 			STARPU_ASSERT(replicate->allocated);
 
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+		if (_starpu_get_node_kind(node) == STARPU_CUDA_RAM)
+		{
+			/* To facilitate the design of interface, we set the
+			 * proper CUDA device in case it is needed. This avoids
+			 * having to set it again in the free method of each
+			 * interface. */
+			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(node));
+			STARPU_ASSERT(err == cudaSuccess);
+		}
+#endif
+
 		mc->ops->free_data_on_node(mc->chunk_interface, node);
 
 		if (handle && !data_was_deleted)
@@ -674,6 +686,19 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct star
 
 		STARPU_TRACE_START_ALLOC(dst_node);
 		STARPU_ASSERT(replicate->data_interface);
+
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+		if (_starpu_get_node_kind(dst_node) == STARPU_CUDA_RAM)
+		{
+			/* To facilitate the design of interface, we set the
+			 * proper CUDA device in case it is needed. This avoids
+			 * having to set it again in the malloc method of each
+			 * interface. */
+			cudaError_t err = cudaSetDevice(starpu_memory_node_to_devid(dst_node));
+			STARPU_ASSERT(err == cudaSuccess);
+		}
+#endif
+
 		allocated_memory = handle->ops->allocate_data_on_node(replicate->data_interface, dst_node);
 		STARPU_TRACE_END_ALLOC(dst_node);
 

+ 7 - 1
src/drivers/cuda/driver_cuda.c

@@ -158,6 +158,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 {
 	int ret;
 	uint32_t mask = 0;
+	cudaError_t cures;
 
 	STARPU_ASSERT(j);
 	struct starpu_task *task = j->task;
@@ -184,7 +185,7 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 
 	if (calibrate_model)
 	{
-		cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
+		cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
 	}
@@ -195,6 +196,11 @@ static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
 	int profiling = starpu_profiling_status_get();
 	profiling_info = task->profiling_info;
 
+#ifdef HAVE_CUDA_MEMCPY_PEER
+	/* Make sure we manipulate the proper device */
+	cures = cudaSetDevice(args->devid);
+#endif
+
 	if ((profiling && profiling_info) || calibrate_model)
 	{
 		starpu_clock_gettime(&codelet_start);