16 年之前 · 056e59082a
--- a/src/datawizard/copy-driver.c
+++ b/src/datawizard/copy-driver.c
@@ -91,6 +91,85 @@ nomem:
 
				 static unsigned communication_cnt = 0;
			
 
				 #endif
			
 
				 
			
 
				+/*
				+ * Copy the data described by `state` from memory node `src_node` to memory
				+ * node `dst_node`, dispatching on the kinds of both nodes to the matching
				+ * entry of the interface's copy_methods table.
				+ *
				+ * state:    data to transfer; state->ops is dereferenced without a check
				+ *           here, so it must be non-NULL
				+ * src_node: memory node the data is read from
				+ * dst_node: memory node the data is written to
				+ *
				+ * Returns the value returned by the selected copy method, or 0 when the
				+ * transfer was handed over with post_data_request(). Node combinations that
				+ * are not implemented end in an assertion failure.
				+ */
				+static int copy_data_1_to_1_generic(data_state *state, uint32_t src_node, uint32_t dst_node)
				+{
				+	int ret = 0;
				+
				+	const struct copy_data_methods_s *copy_methods = state->ops->copy_methods;
				+
				+	node_kind src_kind = get_node_kind(src_node);
				+	node_kind dst_kind = get_node_kind(dst_node);
				+
				+	/* an interface without a method table cannot be copied at all */
				+	STARPU_ASSERT(copy_methods);
				+
				+	switch (dst_kind) {
				+	case RAM:
				+		switch (src_kind) {
				+			case RAM:
				+				/* RAM -> RAM */
				+				STARPU_ASSERT(copy_methods->ram_to_ram);
				+				ret = copy_methods->ram_to_ram(state, src_node, dst_node);
				+				break;
				+#ifdef USE_CUDA
				+			case CUDA_RAM:
				+				/* CUDA_RAM -> RAM */
				+				if (get_local_memory_node() == src_node)
				+				{
				+					/* the caller's local memory node is the source,
				+					 * so it performs the copy directly */
				+					STARPU_ASSERT(copy_methods->cuda_to_ram);
				+					ret = copy_methods->cuda_to_ram(state, src_node, dst_node);
				+				}
				+				else
				+				{
				+					/* put a request to the corresponding GPU */
				+					post_data_request(state, src_node, dst_node);
				+				}
				+				break;
				+#endif
				+			case SPU_LS:
				+				STARPU_ASSERT(0); // TODO
				+				break;
				+			case UNUSED:
				+				printf("error node %u UNUSED\n", (unsigned)src_node);
				+				/* fall through */
				+			default:
				+				assert(0);
				+				break;
				+		}
				+		break;
				+#ifdef USE_CUDA
				+	case CUDA_RAM:
				+		switch (src_kind) {
				+			case RAM:
				+				/* RAM -> CUDA_RAM */
				+				/* the caller's local memory node must be the destination */
				+				STARPU_ASSERT(get_local_memory_node() == dst_node);
				+				STARPU_ASSERT(copy_methods->ram_to_cuda);
				+				ret = copy_methods->ram_to_cuda(state, src_node, dst_node);
				+				break;
				+			case CUDA_RAM:
				+			case SPU_LS:
				+				STARPU_ASSERT(0); // TODO
				+				break;
				+			case UNUSED:
				+			default:
				+				STARPU_ASSERT(0);
				+				break;
				+		}
				+		break;
				+#endif
				+	case SPU_LS:
				+		STARPU_ASSERT(0); // TODO
				+		break;
				+	case UNUSED:
				+	default:
				+		assert(0);
				+		break;
				+	}
				+
				+	return ret;
				+}
			
 
				+
			
 
				 int __attribute__((warn_unused_result)) driver_copy_data_1_to_1(data_state *state, uint32_t src_node, 
			
 
				 				uint32_t dst_node, unsigned donotread)
			
 
				 {
			
@@ -106,7 +185,7 @@ int __attribute__((warn_unused_result)) driver_copy_data_1_to_1(data_state *stat
 
				 	 * we do not perform any transfer */
			
 
				 	if (!donotread) {
			
 
				 		STARPU_ASSERT(state->ops);
			
 
				-		STARPU_ASSERT(state->ops->copy_data_1_to_1);
			
 
				+		//STARPU_ASSERT(state->ops->copy_data_1_to_1);
			
 
				 
			
 
				 #ifdef DATA_STATS
			
 
				 		size_t size = state->ops->get_size(state);
			
@@ -119,7 +198,7 @@ int __attribute__((warn_unused_result)) driver_copy_data_1_to_1(data_state *stat
 
				 
			
 
				 		/* for now we set the size to 0 in the FxT trace XXX */
			
 
				 		TRACE_START_DRIVER_COPY(src_node, dst_node, 0, com_id);
			
 
				-		ret_copy = state->ops->copy_data_1_to_1(state, src_node, dst_node);
			
 
				+		ret_copy = copy_data_1_to_1_generic(state, src_node, dst_node);
			
 
				 		TRACE_END_DRIVER_COPY(src_node, dst_node, 0, com_id);
			
 
				 
			
 
				 		return ret_copy;
			
--- a/src/datawizard/copy-driver.h
+++ b/src/datawizard/copy-driver.h
@@ -28,6 +28,23 @@
 
				 
			
 
				 struct starpu_data_state_t;
			
 
				 
			
 
				+struct copy_data_methods_s { /* per-interface copy routines, one per (source kind, destination kind) pair, named <src>_to_<dst>; each takes (state, src node, dst node) and returns int */
			
 
				+	/* source node kind is RAM; destination is RAM, CUDA or SPU respectively */
			
 
				+	int (*ram_to_ram)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+	int (*ram_to_cuda)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+	int (*ram_to_spu)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+
			
 
				+	/* source node kind is CUDA; destination is RAM, CUDA or SPU respectively */
			
 
				+	int (*cuda_to_ram)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+	int (*cuda_to_cuda)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+	int (*cuda_to_spu)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+
			
 
				+	/* source node kind is SPU; destination is RAM, CUDA or SPU respectively */
			
 
				+	int (*spu_to_ram)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+	int (*spu_to_cuda)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+	int (*spu_to_spu)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
			
 
				+};
			
 
				+
			
 
				 __attribute__((warn_unused_result))
			
 
				 int driver_copy_data(struct starpu_data_state_t *state, 
			
 
				 			uint32_t src_node_mask,
			
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -29,17 +29,37 @@
 
				 /*
			
 
				  * BCSR : blocked CSR, we use blocks of size (r x c)
			
 
				  */
			
 
				+
			
 
				+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#ifdef USE_CUDA
			
 
				+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#endif
			
 
				+
			
 
				+/*
				+ * Copy routines of the BCSR interface, listed by source node kind.
				+ * Entries set to NULL are transfers this interface does not provide.
				+ */
				+static const struct copy_data_methods_s bcsr_copy_data_methods_s = {
				+	/* RAM as source */
				+	.ram_to_ram   = dummy_copy_ram_to_ram,
				+#ifdef USE_CUDA
				+	.ram_to_cuda  = copy_ram_to_cublas,
				+#else
				+	.ram_to_cuda  = NULL,
				+#endif
				+	.ram_to_spu   = NULL,
				+
				+	/* CUDA as source */
				+#ifdef USE_CUDA
				+	.cuda_to_ram  = copy_cublas_to_ram,
				+#else
				+	.cuda_to_ram  = NULL,
				+#endif
				+	.cuda_to_cuda = NULL,
				+	.cuda_to_spu  = NULL,
				+
				+	/* SPU as source */
				+	.spu_to_ram   = NULL,
				+	.spu_to_cuda  = NULL,
				+	.spu_to_spu   = NULL
				+};
			
 
				+
			
 
				 size_t allocate_bcsr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
			
 
				 void liberate_bcsr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
			
 
				 size_t dump_bcsr_interface(starpu_data_interface_t *interface, void *_buffer);
			
 
				-int do_copy_bcsr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				 size_t bcsr_interface_get_size(struct starpu_data_state_t *state);
			
 
				 uint32_t footprint_bcsr_interface_crc32(data_state *state, uint32_t hstate);
			
 
				 
			
 
				 struct data_interface_ops_t interface_bcsr_ops = {
			
 
				 	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
			
 
				 	.liberate_data_on_node = liberate_bcsr_buffer_on_node,
			
 
				-	.copy_data_1_to_1 = do_copy_bcsr_buffer_1_to_1,
			
 
				+	.copy_methods = &bcsr_copy_data_methods_s,
			
 
				 	.dump_data_interface = dump_bcsr_interface,
			
 
				 	.get_size = bcsr_interface_get_size,
			
 
				 	.interfaceid = BCSR_INTERFACE,
			
@@ -339,7 +359,7 @@ void liberate_bcsr_buffer_on_node(starpu_data_interface_t *interface, uint32_t n
 
				 }
			
 
				 
			
 
				 #ifdef USE_CUDA
			
 
				-static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_bcsr_interface_t *src_bcsr;
			
 
				 	starpu_bcsr_interface_t *dst_bcsr;
			
@@ -365,9 +385,10 @@ static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_n
 
				 	
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				 
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				-static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_bcsr_interface_t *src_bcsr;
			
 
				 	starpu_bcsr_interface_t *dst_bcsr;
			
@@ -392,11 +413,13 @@ static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_n
 
				 						(uint8_t *)dst_bcsr->rowptr, 1);
			
 
				 	
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 #endif // USE_CUDA
			
 
				 
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				-static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 
			
 
				 	starpu_bcsr_interface_t *src_bcsr;
			
@@ -419,73 +442,6 @@ static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t sr
 
				 	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-}
			
 
				-
			
 
				-
			
 
				-int do_copy_bcsr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				-{
			
 
				-	node_kind src_kind = get_node_kind(src_node);
			
 
				-	node_kind dst_kind = get_node_kind(dst_node);
			
 
				-
			
 
				-	switch (dst_kind) {
			
 
				-	case RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> RAM */
			
 
				-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
			
 
				-				 break;
			
 
				-#ifdef USE_CUDA
			
 
				-			case CUDA_RAM:
			
 
				-				/* CUBLAS_RAM -> RAM */
			
 
				-				/* only the proper CUBLAS thread can initiate this ! */
			
 
				-				if (get_local_memory_node() == src_node)
			
 
				-				{
			
 
				-					copy_cublas_to_ram(state, src_node, dst_node);
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					post_data_request(state, src_node, dst_node);
			
 
				-				}
			
 
				-				break;
			
 
				-#endif
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-				printf("error node %d UNUSED\n", src_node);
			
 
				-			default:
			
 
				-				assert(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#ifdef USE_CUDA
			
 
				-	case CUDA_RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> CUBLAS_RAM */
			
 
				-				/* only the proper CUBLAS thread can initiate this ! */
			
 
				-				STARPU_ASSERT(get_local_memory_node() == dst_node);
			
 
				-				copy_ram_to_cublas(state, src_node, dst_node);
			
 
				-				break;
			
 
				-			case CUDA_RAM:
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO 
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-			default:
			
 
				-				STARPU_ASSERT(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#endif
			
 
				-	case SPU_LS:
			
 
				-		STARPU_ASSERT(0); // TODO
			
 
				-		break;
			
 
				-	case UNUSED:
			
 
				-	default:
			
 
				-		assert(0);
			
 
				-		break;
			
 
				-	}
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/src/datawizard/interfaces/blas_interface.c
+++ b/src/datawizard/interfaces/blas_interface.c
@@ -28,9 +28,28 @@
 
				 #include <cuda.h>
			
 
				 #endif
			
 
				 
			
 
				+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#ifdef USE_CUDA
			
 
				+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#endif
			
 
				+
			
 
				+/*
				+ * Copy routines of the BLAS interface, listed by source node kind.
				+ * Entries set to NULL are transfers this interface does not provide.
				+ */
				+static const struct copy_data_methods_s blas_copy_data_methods_s = {
				+	/* RAM as source */
				+	.ram_to_ram   = dummy_copy_ram_to_ram,
				+#ifdef USE_CUDA
				+	.ram_to_cuda  = copy_ram_to_cublas,
				+#else
				+	.ram_to_cuda  = NULL,
				+#endif
				+	.ram_to_spu   = NULL,
				+
				+	/* CUDA as source */
				+#ifdef USE_CUDA
				+	.cuda_to_ram  = copy_cublas_to_ram,
				+#else
				+	.cuda_to_ram  = NULL,
				+#endif
				+	.cuda_to_cuda = NULL,
				+	.cuda_to_spu  = NULL,
				+
				+	/* SPU as source */
				+	.spu_to_ram   = NULL,
				+	.spu_to_cuda  = NULL,
				+	.spu_to_spu   = NULL
				+};
			
 
				+
			
 
				 size_t allocate_blas_buffer_on_node(data_state *state, uint32_t dst_node);
			
 
				 void liberate_blas_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
			
 
				-int do_copy_blas_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
			
 
				 size_t dump_blas_interface(starpu_data_interface_t *interface, void *buffer);
			
 
				 size_t blas_interface_get_size(struct starpu_data_state_t *state);
			
 
				 uint32_t footprint_blas_interface_crc32(data_state *state, uint32_t hstate);
			
@@ -42,7 +61,7 @@ int convert_blas_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, go
 
				 struct data_interface_ops_t interface_blas_ops = {
			
 
				 	.allocate_data_on_node = allocate_blas_buffer_on_node,
			
 
				 	.liberate_data_on_node = liberate_blas_buffer_on_node,
			
 
				-	.copy_data_1_to_1 = do_copy_blas_buffer_1_to_1,
			
 
				+	.copy_methods = &blas_copy_data_methods_s,
			
 
				 	.dump_data_interface = dump_blas_interface,
			
 
				 	.get_size = blas_interface_get_size,
			
 
				 	.footprint = footprint_blas_interface_crc32,
			
@@ -279,7 +298,7 @@ void liberate_blas_buffer_on_node(starpu_data_interface_t *interface, uint32_t n
 
				 }
			
 
				 
			
 
				 #ifdef USE_CUDA
			
 
				-static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_blas_interface_t *src_blas;
			
 
				 	starpu_blas_interface_t *dst_blas;
			
@@ -292,10 +311,13 @@ static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t ds
 
				 		(uint8_t *)dst_blas->ptr, dst_blas->ld);
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, src_blas->nx*src_blas->ny*src_blas->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				-static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				+
			
 
				 	starpu_blas_interface_t *src_blas;
			
 
				 	starpu_blas_interface_t *dst_blas;
			
 
				 
			
@@ -308,11 +330,13 @@ static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t ds
 
				 		(uint8_t *)dst_blas->ptr, dst_blas->ld);
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, src_blas->nx*src_blas->ny*src_blas->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 #endif // USE_CUDA
			
 
				 
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				-static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	unsigned y;
			
 
				 	uint32_t nx = state->interface[dst_node].blas.nx;
			
@@ -336,77 +360,6 @@ static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t
 
				 	}
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nx*ny*elemsize);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-int do_copy_blas_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				-{
			
 
				-	node_kind src_kind = get_node_kind(src_node);
			
 
				-	node_kind dst_kind = get_node_kind(dst_node);
			
 
				-
			
 
				-	switch (dst_kind) {
			
 
				-	case RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> RAM */
			
 
				-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
			
 
				-				 break;
			
 
				-#ifdef USE_CUDA
			
 
				-			case CUDA_RAM:
			
 
				-				/* CUBLAS_RAM -> RAM */
			
 
				-				if (get_local_memory_node() == src_node)
			
 
				-				{
			
 
				-					/* only the proper CUBLAS thread can initiate this directly ! */
			
 
				-					copy_cublas_to_ram(state, src_node, dst_node);
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					/* put a request to the corresponding GPU */
			
 
				-		//			fprintf(stderr, "post_data_request state %p src %d dst %d\n", state, src_node, dst_node);
			
 
				-					post_data_request(state, src_node, dst_node);
			
 
				-		//			fprintf(stderr, "post %p OK\n", state);
			
 
				-				}
			
 
				-				break;
			
 
				-#endif
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-				printf("error node %d UNUSED\n", src_node);
			
 
				-			default:
			
 
				-				assert(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#ifdef USE_CUDA
			
 
				-	case CUDA_RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> CUBLAS_RAM */
			
 
				-				/* only the proper CUBLAS thread can initiate this ! */
			
 
				-				STARPU_ASSERT(get_local_memory_node() == dst_node);
			
 
				-				copy_ram_to_cublas(state, src_node, dst_node);
			
 
				-				break;
			
 
				-			case CUDA_RAM:
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO 
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-			default:
			
 
				-				STARPU_ASSERT(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#endif
			
 
				-	case SPU_LS:
			
 
				-		STARPU_ASSERT(0); // TODO
			
 
				-		break;
			
 
				-	case UNUSED:
			
 
				-	default:
			
 
				-		assert(0);
			
 
				-		break;
			
 
				-	}
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				-
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -28,9 +28,29 @@
 
				 #include <cuda.h>
			
 
				 #endif
			
 
				 
			
 
				+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#ifdef USE_CUDA
			
 
				+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#endif
			
 
				+
			
 
				+/*
				+ * Copy routines of the block interface, listed by source node kind.
				+ * Entries set to NULL are transfers this interface does not provide.
				+ */
				+static const struct copy_data_methods_s block_copy_data_methods_s = {
				+	/* RAM as source */
				+	.ram_to_ram   = dummy_copy_ram_to_ram,
				+#ifdef USE_CUDA
				+	.ram_to_cuda  = copy_ram_to_cublas,
				+#else
				+	.ram_to_cuda  = NULL,
				+#endif
				+	.ram_to_spu   = NULL,
				+
				+	/* CUDA as source */
				+#ifdef USE_CUDA
				+	.cuda_to_ram  = copy_cublas_to_ram,
				+#else
				+	.cuda_to_ram  = NULL,
				+#endif
				+	.cuda_to_cuda = NULL,
				+	.cuda_to_spu  = NULL,
				+
				+	/* SPU as source */
				+	.spu_to_ram   = NULL,
				+	.spu_to_cuda  = NULL,
				+	.spu_to_spu   = NULL
				+};
			
 
				+
			
 
				+
			
 
				 size_t allocate_block_buffer_on_node(data_state *state, uint32_t dst_node);
			
 
				 void liberate_block_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
			
 
				-int do_copy_block_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
			
 
				 size_t dump_block_interface(starpu_data_interface_t *interface, void *buffer);
			
 
				 size_t block_interface_get_size(struct starpu_data_state_t *state);
			
 
				 uint32_t footprint_block_interface_crc32(data_state *state, uint32_t hstate);
			
@@ -42,7 +62,7 @@ int convert_block_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, g
 
				 struct data_interface_ops_t interface_block_ops = {
			
 
				 	.allocate_data_on_node = allocate_block_buffer_on_node,
			
 
				 	.liberate_data_on_node = liberate_block_buffer_on_node,
			
 
				-	.copy_data_1_to_1 = do_copy_block_buffer_1_to_1,
			
 
				+	.copy_methods = &block_copy_data_methods_s,
			
 
				 	.dump_data_interface = dump_block_interface,
			
 
				 	.get_size = block_interface_get_size,
			
 
				 	.footprint = footprint_block_interface_crc32,
			
@@ -296,7 +316,7 @@ void liberate_block_buffer_on_node(starpu_data_interface_t *interface, uint32_t
 
				 }
			
 
				 
			
 
				 #ifdef USE_CUDA
			
 
				-static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_block_interface_t *src_block;
			
 
				 	starpu_block_interface_t *dst_block;
			
@@ -326,9 +346,11 @@ static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t ds
 
				 	}
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				-static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_block_interface_t *src_block;
			
 
				 	starpu_block_interface_t *dst_block;
			
@@ -358,11 +380,13 @@ static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t ds
 
				 	}
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 #endif // USE_CUDA
			
 
				 
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				-static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	uint32_t nx = state->interface[dst_node].block.nx;
			
 
				 	uint32_t ny = state->interface[dst_node].block.ny;
			
@@ -389,77 +413,6 @@ static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t
 
				 	}
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-int do_copy_block_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				-{
			
 
				-	node_kind src_kind = get_node_kind(src_node);
			
 
				-	node_kind dst_kind = get_node_kind(dst_node);
			
 
				-
			
 
				-	switch (dst_kind) {
			
 
				-	case RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> RAM */
			
 
				-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
			
 
				-				 break;
			
 
				-#ifdef USE_CUDA
			
 
				-			case CUDA_RAM:
			
 
				-				/* CUBLAS_RAM -> RAM */
			
 
				-				if (get_local_memory_node() == src_node)
			
 
				-				{
			
 
				-					/* only the proper CUBLAS thread can initiate this directly ! */
			
 
				-					copy_cublas_to_ram(state, src_node, dst_node);
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					/* put a request to the corresponding GPU */
			
 
				-		//			fprintf(stderr, "post_data_request state %p src %d dst %d\n", state, src_node, dst_node);
			
 
				-					post_data_request(state, src_node, dst_node);
			
 
				-		//			fprintf(stderr, "post %p OK\n", state);
			
 
				-				}
			
 
				-				break;
			
 
				-#endif
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-				printf("error node %d UNUSED\n", src_node);
			
 
				-			default:
			
 
				-				assert(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#ifdef USE_CUDA
			
 
				-	case CUDA_RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> CUBLAS_RAM */
			
 
				-				/* only the proper CUBLAS thread can initiate this ! */
			
 
				-				STARPU_ASSERT(get_local_memory_node() == dst_node);
			
 
				-				copy_ram_to_cublas(state, src_node, dst_node);
			
 
				-				break;
			
 
				-			case CUDA_RAM:
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO 
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-			default:
			
 
				-				STARPU_ASSERT(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#endif
			
 
				-	case SPU_LS:
			
 
				-		STARPU_ASSERT(0); // TODO
			
 
				-		break;
			
 
				-	case UNUSED:
			
 
				-	default:
			
 
				-		assert(0);
			
 
				-		break;
			
 
				-	}
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				-
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -27,17 +27,37 @@
 
				 #include <cuda.h>
			
 
				 #endif
			
 
				 
			
 
				+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#ifdef USE_CUDA
			
 
				+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#endif
			
 
				+
			
 
				+/*
				+ * Copy routines of the CSR interface, listed by source node kind.
				+ * Entries set to NULL are transfers this interface does not provide.
				+ */
				+static const struct copy_data_methods_s csr_copy_data_methods_s = {
				+	/* RAM as source */
				+	.ram_to_ram   = dummy_copy_ram_to_ram,
				+#ifdef USE_CUDA
				+	.ram_to_cuda  = copy_ram_to_cublas,
				+#else
				+	.ram_to_cuda  = NULL,
				+#endif
				+	.ram_to_spu   = NULL,
				+
				+	/* CUDA as source */
				+#ifdef USE_CUDA
				+	.cuda_to_ram  = copy_cublas_to_ram,
				+#else
				+	.cuda_to_ram  = NULL,
				+#endif
				+	.cuda_to_cuda = NULL,
				+	.cuda_to_spu  = NULL,
				+
				+	/* SPU as source */
				+	.spu_to_ram   = NULL,
				+	.spu_to_cuda  = NULL,
				+	.spu_to_spu   = NULL
				+};
			
 
				+
			
 
				+
			
 
				 size_t allocate_csr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
			
 
				 void liberate_csr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
			
 
				 size_t dump_csr_interface(starpu_data_interface_t *interface, void *_buffer);
			
 
				-int do_copy_csr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				 size_t csr_interface_get_size(struct starpu_data_state_t *state);
			
 
				 uint32_t footprint_csr_interface_crc32(data_state *state, uint32_t hstate);
			
 
				 
			
 
				 struct data_interface_ops_t interface_csr_ops = {
			
 
				 	.allocate_data_on_node = allocate_csr_buffer_on_node,
			
 
				 	.liberate_data_on_node = liberate_csr_buffer_on_node,
			
 
				-	.copy_data_1_to_1 = do_copy_csr_buffer_1_to_1,
			
 
				+	.copy_methods = &csr_copy_data_methods_s,
			
 
				 	.dump_data_interface = dump_csr_interface,
			
 
				 	.get_size = csr_interface_get_size,
			
 
				 	.interfaceid = CSR_INTERFACE,
			
@@ -308,7 +328,7 @@ void liberate_csr_buffer_on_node(starpu_data_interface_t *interface, uint32_t no
 
				 }
			
 
				 
			
 
				 #ifdef USE_CUDA
			
 
				-static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_csr_interface_t *src_csr;
			
 
				 	starpu_csr_interface_t *dst_csr;
			
@@ -331,9 +351,10 @@ static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_n
 
				 	
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				 
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				-static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_csr_interface_t *src_csr;
			
 
				 	starpu_csr_interface_t *dst_csr;
			
@@ -355,11 +376,13 @@ static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_n
 
				 						(uint8_t *)dst_csr->rowptr, 1);
			
 
				 	
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 #endif // USE_CUDA
			
 
				 
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				-static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 
			
 
				 	starpu_csr_interface_t *src_csr;
			
@@ -379,73 +402,6 @@ static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t sr
 
				 	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-}
			
 
				-
			
 
				-
			
 
				-int do_copy_csr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
			
 
				-{
			
 
				-	node_kind src_kind = get_node_kind(src_node);
			
 
				-	node_kind dst_kind = get_node_kind(dst_node);
			
 
				-
			
 
				-	switch (dst_kind) {
			
 
				-	case RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> RAM */
			
 
				-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
			
 
				-				 break;
			
 
				-#ifdef USE_CUDA
			
 
				-			case CUDA_RAM:
			
 
				-				/* CUBLAS_RAM -> RAM */
			
 
				-				/* only the proper CUBLAS thread can initiate this ! */
			
 
				-				if (get_local_memory_node() == src_node)
			
 
				-				{
			
 
				-					copy_cublas_to_ram(state, src_node, dst_node);
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					post_data_request(state, src_node, dst_node);
			
 
				-				}
			
 
				-				break;
			
 
				-#endif
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-				printf("error node %d UNUSED\n", src_node);
			
 
				-			default:
			
 
				-				assert(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#ifdef USE_CUDA
			
 
				-	case CUDA_RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> CUBLAS_RAM */
			
 
				-				/* only the proper CUBLAS thread can initiate this ! */
			
 
				-				STARPU_ASSERT(get_local_memory_node() == dst_node);
			
 
				-				copy_ram_to_cublas(state, src_node, dst_node);
			
 
				-				break;
			
 
				-			case CUDA_RAM:
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO 
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-			default:
			
 
				-				STARPU_ASSERT(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#endif
			
 
				-	case SPU_LS:
			
 
				-		STARPU_ASSERT(0); // TODO
			
 
				-		break;
			
 
				-	case UNUSED:
			
 
				-	default:
			
 
				-		assert(0);
			
 
				-		break;
			
 
				-	}
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/src/datawizard/interfaces/data_interface.h
+++ b/src/datawizard/interfaces/data_interface.h
@@ -42,8 +42,7 @@ struct data_interface_ops_t {
 
				 					uint32_t node);
			
 
				 	void (*liberate_data_on_node)(starpu_data_interface_t *interface,
			
 
				 					uint32_t node);
			
 
				-	int (*copy_data_1_to_1)(struct starpu_data_state_t *state, 
			
 
				-					uint32_t src, uint32_t dst);
			
 
				+	const struct copy_data_methods_s *copy_methods;
			
 
				 	size_t (*dump_data_interface)(starpu_data_interface_t *interface, 
			
 
				 					void *buffer);
			
 
				 	size_t (*get_size)(struct starpu_data_state_t *state);
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -28,9 +28,28 @@
 
				 #include <cuda.h>
			
 
				 #endif
			
 
				 
			
 
				+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#ifdef USE_CUDA
			
 
				+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
			
 
				+#endif
			
 
				+
			
 
				+static const struct copy_data_methods_s vector_copy_data_methods_s = {
			
 
				+	.ram_to_ram = dummy_copy_ram_to_ram,
			
 
				+	.ram_to_spu = NULL,
			
 
				+#ifdef USE_CUDA
			
 
				+	.ram_to_cuda = copy_ram_to_cublas,
			
 
				+	.cuda_to_ram = copy_cublas_to_ram,
			
 
				+#endif
			
 
				+	.cuda_to_cuda = NULL,
			
 
				+	.cuda_to_spu = NULL,
			
 
				+	.spu_to_ram = NULL,
			
 
				+	.spu_to_cuda = NULL,
			
 
				+	.spu_to_spu = NULL
			
 
				+};
			
 
				+
			
 
				 size_t allocate_vector_buffer_on_node(data_state *state, uint32_t dst_node);
			
 
				 void liberate_vector_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
			
 
				-int do_copy_vector_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
			
 
				 size_t dump_vector_interface(starpu_data_interface_t *interface, void *buffer);
			
 
				 size_t vector_interface_get_size(struct starpu_data_state_t *state);
			
 
				 uint32_t footprint_vector_interface_crc32(data_state *state, uint32_t hstate);
			
@@ -42,7 +61,7 @@ int convert_vector_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr,
 
				 struct data_interface_ops_t interface_vector_ops = {
			
 
				 	.allocate_data_on_node = allocate_vector_buffer_on_node,
			
 
				 	.liberate_data_on_node = liberate_vector_buffer_on_node,
			
 
				-	.copy_data_1_to_1 = do_copy_vector_buffer_1_to_1,
			
 
				+	.copy_methods = &vector_copy_data_methods_s,
			
 
				 	.dump_data_interface = dump_vector_interface,
			
 
				 	.get_size = vector_interface_get_size,
			
 
				 	.footprint = footprint_vector_interface_crc32,
			
@@ -223,7 +242,7 @@ void liberate_vector_buffer_on_node(starpu_data_interface_t *interface, uint32_t
 
				 }
			
 
				 
			
 
				 #ifdef USE_CUDA
			
 
				-static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_vector_interface_t *src_vector;
			
 
				 	starpu_vector_interface_t *dst_vector;
			
@@ -236,9 +255,11 @@ static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t ds
 
				 		(uint8_t *)dst_vector->ptr, 1);
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				-static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	starpu_vector_interface_t *src_vector;
			
 
				 	starpu_vector_interface_t *dst_vector;
			
@@ -251,10 +272,12 @@ static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t ds
 
				 		(uint8_t *)dst_vector->ptr, 1);
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 #endif // USE_CUDA
			
 
				 
			
 
				-static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				+static int dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				 {
			
 
				 	uint32_t nx = state->interface[dst_node].vector.nx;
			
 
				 	size_t elemsize = state->interface[dst_node].vector.elemsize;
			
@@ -265,75 +288,6 @@ static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t
 
				 	memcpy((void *)ptr_dst, (void *)ptr_src, nx*elemsize);
			
 
				 
			
 
				 	TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
			
 
				-}
			
 
				-
			
 
				-int do_copy_vector_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
			
 
				-{
			
 
				-	node_kind src_kind = get_node_kind(src_node);
			
 
				-	node_kind dst_kind = get_node_kind(dst_node);
			
 
				-
			
 
				-	switch (dst_kind) {
			
 
				-	case RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> RAM */
			
 
				-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
			
 
				-				 break;
			
 
				-#ifdef USE_CUDA
			
 
				-			case CUDA_RAM:
			
 
				-				/* CUBLAS_RAM -> RAM */
			
 
				-				/* only the proper CUBLAS thread can initiate this ! */
			
 
				-				if (get_local_memory_node() == src_node)
			
 
				-				{
			
 
				-					/* only the proper CUBLAS thread can initiate this directly ! */
			
 
				-					copy_cublas_to_ram(state, src_node, dst_node);
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					/* put a request to the corresponding GPU */
			
 
				-					post_data_request(state, src_node, dst_node);
			
 
				-				}
			
 
				-				break;
			
 
				-#endif
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-				printf("error node %d UNUSED\n", src_node);
			
 
				-			default:
			
 
				-				assert(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#ifdef USE_CUDA
			
 
				-	case CUDA_RAM:
			
 
				-		switch (src_kind) {
			
 
				-			case RAM:
			
 
				-				/* RAM -> CUBLAS_RAM */
			
 
				-				/* only the proper CUBLAS thread can initiate this ! */
			
 
				-				STARPU_ASSERT(get_local_memory_node() == dst_node);
			
 
				-				copy_ram_to_cublas(state, src_node, dst_node);
			
 
				-				break;
			
 
				-			case CUDA_RAM:
			
 
				-			case SPU_LS:
			
 
				-				STARPU_ASSERT(0); // TODO 
			
 
				-				break;
			
 
				-			case UNUSED:
			
 
				-			default:
			
 
				-				STARPU_ASSERT(0);
			
 
				-				break;
			
 
				-		}
			
 
				-		break;
			
 
				-#endif
			
 
				-	case SPU_LS:
			
 
				-		STARPU_ASSERT(0); // TODO
			
 
				-		break;
			
 
				-	case UNUSED:
			
 
				-	default:
			
 
				-		assert(0);
			
 
				-		break;
			
 
				-	}
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				-