Browse Source

Factorize the data transfer code; this should make it a little easier to
define a new data interface.

Cédric Augonnet 16 years ago
parent
commit
056e59082a

+ 81 - 2
src/datawizard/copy-driver.c

@@ -91,6 +91,85 @@ nomem:
 static unsigned communication_cnt = 0;
 #endif
 
+static int copy_data_1_to_1_generic(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	int ret;
+	/* Pick the interface's copy routine matching the (source kind, destination kind) pair. */
+	/* This replaces the former per-interface state->ops->copy_data_1_to_1 callback. */
+	/* The routines come from the copy_methods table attached to the interface ops. */
+	const struct copy_data_methods_s *copy_methods = state->ops->copy_methods;
+
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM (NOTE(review): the routine's int result is ignored) */
+				 copy_methods->ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUDA_RAM -> RAM */
+				/* the copy is run here only when src_node is this thread's local memory node */
+				if (get_local_memory_node() == src_node)
+				{
+					/* we are on the source node: run the copy directly */
+					copy_methods->cuda_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					/* otherwise put a request to the corresponding GPU instead of copying here */
+					post_data_request(state, src_node, dst_node);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO: SPU_LS -> RAM is not implemented
+				break;
+			case UNUSED:
+				printf("error node %d UNUSED\n", src_node); /* no break: falls through to the assert below */
+			default:
+				assert(0); /* NOTE(review): plain assert here, STARPU_ASSERT in the other branches */
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUDA_RAM */
+				/* asserted: dst_node must be this thread's local memory node */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_methods->ram_to_cuda(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO: CUDA_RAM -> CUDA_RAM and SPU_LS -> CUDA_RAM are not implemented
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO: no transfer towards SPU_LS is implemented
+		break;
+	case UNUSED:
+	default:
+		assert(0); /* NOTE(review): plain assert here, STARPU_ASSERT in the other branches */
+		break;
+	}
+
+	/* XXX: the values returned by the copy routines above are discarded; 0 is always returned */
+	ret = 0;
+
+	return ret;
+}
+
 int __attribute__((warn_unused_result)) driver_copy_data_1_to_1(data_state *state, uint32_t src_node, 
 				uint32_t dst_node, unsigned donotread)
 {
@@ -106,7 +185,7 @@ int __attribute__((warn_unused_result)) driver_copy_data_1_to_1(data_state *stat
 	 * we do not perform any transfer */
 	if (!donotread) {
 		STARPU_ASSERT(state->ops);
-		STARPU_ASSERT(state->ops->copy_data_1_to_1);
+		//STARPU_ASSERT(state->ops->copy_data_1_to_1);
 
 #ifdef DATA_STATS
 		size_t size = state->ops->get_size(state);
@@ -119,7 +198,7 @@ int __attribute__((warn_unused_result)) driver_copy_data_1_to_1(data_state *stat
 
 		/* for now we set the size to 0 in the FxT trace XXX */
 		TRACE_START_DRIVER_COPY(src_node, dst_node, 0, com_id);
-		ret_copy = state->ops->copy_data_1_to_1(state, src_node, dst_node);
+		ret_copy = copy_data_1_to_1_generic(state, src_node, dst_node);
 		TRACE_END_DRIVER_COPY(src_node, dst_node, 0, com_id);
 
 		return ret_copy;

+ 17 - 0
src/datawizard/copy-driver.h

@@ -28,6 +28,23 @@
 
 struct starpu_data_state_t;
 
+struct copy_data_methods_s { /* table of copy routines, one slot per (source kind, destination kind) pair */
+	/* source is ram; every routine takes (state, src node, dst node) and returns an int */
+	int (*ram_to_ram)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+	int (*ram_to_cuda)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+	int (*ram_to_spu)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+
+	/* source is cuda */
+	int (*cuda_to_ram)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+	int (*cuda_to_cuda)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+	int (*cuda_to_spu)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+
+	/* source is spu */
+	int (*spu_to_ram)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+	int (*spu_to_cuda)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+	int (*spu_to_spu)(struct starpu_data_state_t *state, uint32_t src, uint32_t dst);
+};
+
 __attribute__((warn_unused_result))
 int driver_copy_data(struct starpu_data_state_t *state, 
 			uint32_t src_node_mask,

+ 28 - 72
src/datawizard/interfaces/bcsr_interface.c

@@ -29,17 +29,37 @@
 /*
  * BCSR : blocked CSR, we use blocks of size (r x c)
  */
+
+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#ifdef USE_CUDA
+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#endif
+
+static const struct copy_data_methods_s bcsr_copy_data_methods_s = { /* copy routines of the BCSR interface */
+	.ram_to_ram = dummy_copy_ram_to_ram,
+	.ram_to_spu = NULL, /* no routine provided */
+#ifdef USE_CUDA /* the two CUDA routines are only set when USE_CUDA is defined */
+	.ram_to_cuda = copy_ram_to_cublas,
+	.cuda_to_ram = copy_cublas_to_ram,
+#endif
+	.cuda_to_cuda = NULL, /* no routine provided for this or the remaining slots */
+	.cuda_to_spu = NULL,
+	.spu_to_ram = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu = NULL
+};
+
 size_t allocate_bcsr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
 void liberate_bcsr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
 size_t dump_bcsr_interface(starpu_data_interface_t *interface, void *_buffer);
-int do_copy_bcsr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
 size_t bcsr_interface_get_size(struct starpu_data_state_t *state);
 uint32_t footprint_bcsr_interface_crc32(data_state *state, uint32_t hstate);
 
 struct data_interface_ops_t interface_bcsr_ops = {
 	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
 	.liberate_data_on_node = liberate_bcsr_buffer_on_node,
-	.copy_data_1_to_1 = do_copy_bcsr_buffer_1_to_1,
+	.copy_methods = &bcsr_copy_data_methods_s,
 	.dump_data_interface = dump_bcsr_interface,
 	.get_size = bcsr_interface_get_size,
 	.interfaceid = BCSR_INTERFACE,
@@ -339,7 +359,7 @@ void liberate_bcsr_buffer_on_node(starpu_data_interface_t *interface, uint32_t n
 }
 
 #ifdef USE_CUDA
-static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_bcsr_interface_t *src_bcsr;
 	starpu_bcsr_interface_t *dst_bcsr;
@@ -365,9 +385,10 @@ static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_n
 	
 	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
+	return 0;
 }
 
-static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_bcsr_interface_t *src_bcsr;
 	starpu_bcsr_interface_t *dst_bcsr;
@@ -392,11 +413,13 @@ static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_n
 						(uint8_t *)dst_bcsr->rowptr, 1);
 	
 	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
 }
 #endif // USE_CUDA
 
 /* as not all platform easily have a BLAS lib installed ... */
-static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
 {
 
 	starpu_bcsr_interface_t *src_bcsr;
@@ -419,73 +442,6 @@ static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t sr
 	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
 
 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
-}
-
-
-int do_copy_bcsr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
-{
-	node_kind src_kind = get_node_kind(src_node);
-	node_kind dst_kind = get_node_kind(dst_node);
-
-	switch (dst_kind) {
-	case RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> RAM */
-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
-				 break;
-#ifdef USE_CUDA
-			case CUDA_RAM:
-				/* CUBLAS_RAM -> RAM */
-				/* only the proper CUBLAS thread can initiate this ! */
-				if (get_local_memory_node() == src_node)
-				{
-					copy_cublas_to_ram(state, src_node, dst_node);
-				}
-				else
-				{
-					post_data_request(state, src_node, dst_node);
-				}
-				break;
-#endif
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO
-				break;
-			case UNUSED:
-				printf("error node %d UNUSED\n", src_node);
-			default:
-				assert(0);
-				break;
-		}
-		break;
-#ifdef USE_CUDA
-	case CUDA_RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> CUBLAS_RAM */
-				/* only the proper CUBLAS thread can initiate this ! */
-				STARPU_ASSERT(get_local_memory_node() == dst_node);
-				copy_ram_to_cublas(state, src_node, dst_node);
-				break;
-			case CUDA_RAM:
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO 
-				break;
-			case UNUSED:
-			default:
-				STARPU_ASSERT(0);
-				break;
-		}
-		break;
-#endif
-	case SPU_LS:
-		STARPU_ASSERT(0); // TODO
-		break;
-	case UNUSED:
-	default:
-		assert(0);
-		break;
-	}
 
 	return 0;
 }

+ 29 - 76
src/datawizard/interfaces/blas_interface.c

@@ -28,9 +28,28 @@
 #include <cuda.h>
 #endif
 
+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#ifdef USE_CUDA
+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#endif
+
+static const struct copy_data_methods_s blas_copy_data_methods_s = { /* copy routines of the BLAS interface */
+	.ram_to_ram = dummy_copy_ram_to_ram,
+	.ram_to_spu = NULL, /* no routine provided */
+#ifdef USE_CUDA /* the two CUDA routines are only set when USE_CUDA is defined */
+	.ram_to_cuda = copy_ram_to_cublas,
+	.cuda_to_ram = copy_cublas_to_ram,
+#endif
+	.cuda_to_cuda = NULL, /* no routine provided for this or the remaining slots */
+	.cuda_to_spu = NULL,
+	.spu_to_ram = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu = NULL
+};
+
 size_t allocate_blas_buffer_on_node(data_state *state, uint32_t dst_node);
 void liberate_blas_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
-int do_copy_blas_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
 size_t dump_blas_interface(starpu_data_interface_t *interface, void *buffer);
 size_t blas_interface_get_size(struct starpu_data_state_t *state);
 uint32_t footprint_blas_interface_crc32(data_state *state, uint32_t hstate);
@@ -42,7 +61,7 @@ int convert_blas_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, go
 struct data_interface_ops_t interface_blas_ops = {
 	.allocate_data_on_node = allocate_blas_buffer_on_node,
 	.liberate_data_on_node = liberate_blas_buffer_on_node,
-	.copy_data_1_to_1 = do_copy_blas_buffer_1_to_1,
+	.copy_methods = &blas_copy_data_methods_s,
 	.dump_data_interface = dump_blas_interface,
 	.get_size = blas_interface_get_size,
 	.footprint = footprint_blas_interface_crc32,
@@ -279,7 +298,7 @@ void liberate_blas_buffer_on_node(starpu_data_interface_t *interface, uint32_t n
 }
 
 #ifdef USE_CUDA
-static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_blas_interface_t *src_blas;
 	starpu_blas_interface_t *dst_blas;
@@ -292,10 +311,13 @@ static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t ds
 		(uint8_t *)dst_blas->ptr, dst_blas->ld);
 
 	TRACE_DATA_COPY(src_node, dst_node, src_blas->nx*src_blas->ny*src_blas->elemsize);
+
+	return 0;
 }
 
-static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
+
 	starpu_blas_interface_t *src_blas;
 	starpu_blas_interface_t *dst_blas;
 
@@ -308,11 +330,13 @@ static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t ds
 		(uint8_t *)dst_blas->ptr, dst_blas->ld);
 
 	TRACE_DATA_COPY(src_node, dst_node, src_blas->nx*src_blas->ny*src_blas->elemsize);
+
+	return 0;
 }
 #endif // USE_CUDA
 
 /* as not all platform easily have a BLAS lib installed ... */
-static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
 	unsigned y;
 	uint32_t nx = state->interface[dst_node].blas.nx;
@@ -336,77 +360,6 @@ static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t
 	}
 
 	TRACE_DATA_COPY(src_node, dst_node, nx*ny*elemsize);
-}
-
-
-int do_copy_blas_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
-{
-	node_kind src_kind = get_node_kind(src_node);
-	node_kind dst_kind = get_node_kind(dst_node);
-
-	switch (dst_kind) {
-	case RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> RAM */
-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
-				 break;
-#ifdef USE_CUDA
-			case CUDA_RAM:
-				/* CUBLAS_RAM -> RAM */
-				if (get_local_memory_node() == src_node)
-				{
-					/* only the proper CUBLAS thread can initiate this directly ! */
-					copy_cublas_to_ram(state, src_node, dst_node);
-				}
-				else
-				{
-					/* put a request to the corresponding GPU */
-		//			fprintf(stderr, "post_data_request state %p src %d dst %d\n", state, src_node, dst_node);
-					post_data_request(state, src_node, dst_node);
-		//			fprintf(stderr, "post %p OK\n", state);
-				}
-				break;
-#endif
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO
-				break;
-			case UNUSED:
-				printf("error node %d UNUSED\n", src_node);
-			default:
-				assert(0);
-				break;
-		}
-		break;
-#ifdef USE_CUDA
-	case CUDA_RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> CUBLAS_RAM */
-				/* only the proper CUBLAS thread can initiate this ! */
-				STARPU_ASSERT(get_local_memory_node() == dst_node);
-				copy_ram_to_cublas(state, src_node, dst_node);
-				break;
-			case CUDA_RAM:
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO 
-				break;
-			case UNUSED:
-			default:
-				STARPU_ASSERT(0);
-				break;
-		}
-		break;
-#endif
-	case SPU_LS:
-		STARPU_ASSERT(0); // TODO
-		break;
-	case UNUSED:
-	default:
-		assert(0);
-		break;
-	}
 
 	return 0;
 }
-

+ 29 - 76
src/datawizard/interfaces/block_interface.c

@@ -28,9 +28,29 @@
 #include <cuda.h>
 #endif
 
+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#ifdef USE_CUDA
+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#endif
+
+static const struct copy_data_methods_s block_copy_data_methods_s = { /* copy routines of the block interface */
+	.ram_to_ram = dummy_copy_ram_to_ram,
+	.ram_to_spu = NULL, /* no routine provided */
+#ifdef USE_CUDA /* the two CUDA routines are only set when USE_CUDA is defined */
+	.ram_to_cuda = copy_ram_to_cublas,
+	.cuda_to_ram = copy_cublas_to_ram,
+#endif
+	.cuda_to_cuda = NULL, /* no routine provided for this or the remaining slots */
+	.cuda_to_spu = NULL,
+	.spu_to_ram = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu = NULL
+};
+
+
 size_t allocate_block_buffer_on_node(data_state *state, uint32_t dst_node);
 void liberate_block_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
-int do_copy_block_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
 size_t dump_block_interface(starpu_data_interface_t *interface, void *buffer);
 size_t block_interface_get_size(struct starpu_data_state_t *state);
 uint32_t footprint_block_interface_crc32(data_state *state, uint32_t hstate);
@@ -42,7 +62,7 @@ int convert_block_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, g
 struct data_interface_ops_t interface_block_ops = {
 	.allocate_data_on_node = allocate_block_buffer_on_node,
 	.liberate_data_on_node = liberate_block_buffer_on_node,
-	.copy_data_1_to_1 = do_copy_block_buffer_1_to_1,
+	.copy_methods = &block_copy_data_methods_s,
 	.dump_data_interface = dump_block_interface,
 	.get_size = block_interface_get_size,
 	.footprint = footprint_block_interface_crc32,
@@ -296,7 +316,7 @@ void liberate_block_buffer_on_node(starpu_data_interface_t *interface, uint32_t
 }
 
 #ifdef USE_CUDA
-static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_block_interface_t *src_block;
 	starpu_block_interface_t *dst_block;
@@ -326,9 +346,11 @@ static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t ds
 	}
 
 	TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
+
+	return 0;
 }
 
-static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_block_interface_t *src_block;
 	starpu_block_interface_t *dst_block;
@@ -358,11 +380,13 @@ static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t ds
 	}
 
 	TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+
+	return 0;
 }
 #endif // USE_CUDA
 
 /* as not all platform easily have a BLAS lib installed ... */
-static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
 	uint32_t nx = state->interface[dst_node].block.nx;
 	uint32_t ny = state->interface[dst_node].block.ny;
@@ -389,77 +413,6 @@ static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t
 	}
 
 	TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
-}
-
-
-int do_copy_block_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
-{
-	node_kind src_kind = get_node_kind(src_node);
-	node_kind dst_kind = get_node_kind(dst_node);
-
-	switch (dst_kind) {
-	case RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> RAM */
-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
-				 break;
-#ifdef USE_CUDA
-			case CUDA_RAM:
-				/* CUBLAS_RAM -> RAM */
-				if (get_local_memory_node() == src_node)
-				{
-					/* only the proper CUBLAS thread can initiate this directly ! */
-					copy_cublas_to_ram(state, src_node, dst_node);
-				}
-				else
-				{
-					/* put a request to the corresponding GPU */
-		//			fprintf(stderr, "post_data_request state %p src %d dst %d\n", state, src_node, dst_node);
-					post_data_request(state, src_node, dst_node);
-		//			fprintf(stderr, "post %p OK\n", state);
-				}
-				break;
-#endif
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO
-				break;
-			case UNUSED:
-				printf("error node %d UNUSED\n", src_node);
-			default:
-				assert(0);
-				break;
-		}
-		break;
-#ifdef USE_CUDA
-	case CUDA_RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> CUBLAS_RAM */
-				/* only the proper CUBLAS thread can initiate this ! */
-				STARPU_ASSERT(get_local_memory_node() == dst_node);
-				copy_ram_to_cublas(state, src_node, dst_node);
-				break;
-			case CUDA_RAM:
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO 
-				break;
-			case UNUSED:
-			default:
-				STARPU_ASSERT(0);
-				break;
-		}
-		break;
-#endif
-	case SPU_LS:
-		STARPU_ASSERT(0); // TODO
-		break;
-	case UNUSED:
-	default:
-		assert(0);
-		break;
-	}
 
 	return 0;
 }
-

+ 28 - 72
src/datawizard/interfaces/csr_interface.c

@@ -27,17 +27,37 @@
 #include <cuda.h>
 #endif
 
+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#ifdef USE_CUDA
+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#endif
+
+static const struct copy_data_methods_s csr_copy_data_methods_s = { /* copy routines of the CSR interface */
+	.ram_to_ram = dummy_copy_ram_to_ram,
+	.ram_to_spu = NULL, /* no routine provided */
+#ifdef USE_CUDA /* the two CUDA routines are only set when USE_CUDA is defined */
+	.ram_to_cuda = copy_ram_to_cublas,
+	.cuda_to_ram = copy_cublas_to_ram,
+#endif
+	.cuda_to_cuda = NULL, /* no routine provided for this or the remaining slots */
+	.cuda_to_spu = NULL,
+	.spu_to_ram = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu = NULL
+};
+
+
 size_t allocate_csr_buffer_on_node(struct starpu_data_state_t *state, uint32_t dst_node);
 void liberate_csr_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
 size_t dump_csr_interface(starpu_data_interface_t *interface, void *_buffer);
-int do_copy_csr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
 size_t csr_interface_get_size(struct starpu_data_state_t *state);
 uint32_t footprint_csr_interface_crc32(data_state *state, uint32_t hstate);
 
 struct data_interface_ops_t interface_csr_ops = {
 	.allocate_data_on_node = allocate_csr_buffer_on_node,
 	.liberate_data_on_node = liberate_csr_buffer_on_node,
-	.copy_data_1_to_1 = do_copy_csr_buffer_1_to_1,
+	.copy_methods = &csr_copy_data_methods_s,
 	.dump_data_interface = dump_csr_interface,
 	.get_size = csr_interface_get_size,
 	.interfaceid = CSR_INTERFACE,
@@ -308,7 +328,7 @@ void liberate_csr_buffer_on_node(starpu_data_interface_t *interface, uint32_t no
 }
 
 #ifdef USE_CUDA
-static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_csr_interface_t *src_csr;
 	starpu_csr_interface_t *dst_csr;
@@ -331,9 +351,10 @@ static void copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_n
 	
 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
+	return 0;
 }
 
-static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_csr_interface_t *src_csr;
 	starpu_csr_interface_t *dst_csr;
@@ -355,11 +376,13 @@ static void copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_n
 						(uint8_t *)dst_csr->rowptr, 1);
 	
 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+
+	return 0;
 }
 #endif // USE_CUDA
 
 /* as not all platform easily have a BLAS lib installed ... */
-static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
 {
 
 	starpu_csr_interface_t *src_csr;
@@ -379,73 +402,6 @@ static void dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t sr
 	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
 
 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-}
-
-
-int do_copy_csr_buffer_1_to_1(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node)
-{
-	node_kind src_kind = get_node_kind(src_node);
-	node_kind dst_kind = get_node_kind(dst_node);
-
-	switch (dst_kind) {
-	case RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> RAM */
-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
-				 break;
-#ifdef USE_CUDA
-			case CUDA_RAM:
-				/* CUBLAS_RAM -> RAM */
-				/* only the proper CUBLAS thread can initiate this ! */
-				if (get_local_memory_node() == src_node)
-				{
-					copy_cublas_to_ram(state, src_node, dst_node);
-				}
-				else
-				{
-					post_data_request(state, src_node, dst_node);
-				}
-				break;
-#endif
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO
-				break;
-			case UNUSED:
-				printf("error node %d UNUSED\n", src_node);
-			default:
-				assert(0);
-				break;
-		}
-		break;
-#ifdef USE_CUDA
-	case CUDA_RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> CUBLAS_RAM */
-				/* only the proper CUBLAS thread can initiate this ! */
-				STARPU_ASSERT(get_local_memory_node() == dst_node);
-				copy_ram_to_cublas(state, src_node, dst_node);
-				break;
-			case CUDA_RAM:
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO 
-				break;
-			case UNUSED:
-			default:
-				STARPU_ASSERT(0);
-				break;
-		}
-		break;
-#endif
-	case SPU_LS:
-		STARPU_ASSERT(0); // TODO
-		break;
-	case UNUSED:
-	default:
-		assert(0);
-		break;
-	}
 
 	return 0;
 }

+ 1 - 2
src/datawizard/interfaces/data_interface.h

@@ -42,8 +42,7 @@ struct data_interface_ops_t {
 					uint32_t node);
 	void (*liberate_data_on_node)(starpu_data_interface_t *interface,
 					uint32_t node);
-	int (*copy_data_1_to_1)(struct starpu_data_state_t *state, 
-					uint32_t src, uint32_t dst);
+	const struct copy_data_methods_s *copy_methods;
 	size_t (*dump_data_interface)(starpu_data_interface_t *interface, 
 					void *buffer);
 	size_t (*get_size)(struct starpu_data_state_t *state);

+ 28 - 74
src/datawizard/interfaces/vector_interface.c

@@ -28,9 +28,28 @@
 #include <cuda.h>
 #endif
 
+static int dummy_copy_ram_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#ifdef USE_CUDA
+static int copy_ram_to_cublas(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+static int copy_cublas_to_ram(struct starpu_data_state_t *state, uint32_t src_node, uint32_t dst_node);
+#endif
+
+static const struct copy_data_methods_s vector_copy_data_methods_s = { /* copy routines of the vector interface */
+	.ram_to_ram = dummy_copy_ram_to_ram,
+	.ram_to_spu = NULL, /* no routine provided */
+#ifdef USE_CUDA /* the two CUDA routines are only set when USE_CUDA is defined */
+	.ram_to_cuda = copy_ram_to_cublas,
+	.cuda_to_ram = copy_cublas_to_ram,
+#endif
+	.cuda_to_cuda = NULL, /* no routine provided for this or the remaining slots */
+	.cuda_to_spu = NULL,
+	.spu_to_ram = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu = NULL
+};
+
 size_t allocate_vector_buffer_on_node(data_state *state, uint32_t dst_node);
 void liberate_vector_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
-int do_copy_vector_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
 size_t dump_vector_interface(starpu_data_interface_t *interface, void *buffer);
 size_t vector_interface_get_size(struct starpu_data_state_t *state);
 uint32_t footprint_vector_interface_crc32(data_state *state, uint32_t hstate);
@@ -42,7 +61,7 @@ int convert_vector_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr,
 struct data_interface_ops_t interface_vector_ops = {
 	.allocate_data_on_node = allocate_vector_buffer_on_node,
 	.liberate_data_on_node = liberate_vector_buffer_on_node,
-	.copy_data_1_to_1 = do_copy_vector_buffer_1_to_1,
+	.copy_methods = &vector_copy_data_methods_s,
 	.dump_data_interface = dump_vector_interface,
 	.get_size = vector_interface_get_size,
 	.footprint = footprint_vector_interface_crc32,
@@ -223,7 +242,7 @@ void liberate_vector_buffer_on_node(starpu_data_interface_t *interface, uint32_t
 }
 
 #ifdef USE_CUDA
-static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_vector_interface_t *src_vector;
 	starpu_vector_interface_t *dst_vector;
@@ -236,9 +255,11 @@ static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t ds
 		(uint8_t *)dst_vector->ptr, 1);
 
 	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return 0;
 }
 
-static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_vector_interface_t *src_vector;
 	starpu_vector_interface_t *dst_vector;
@@ -251,10 +272,12 @@ static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t ds
 		(uint8_t *)dst_vector->ptr, 1);
 
 	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+
+	return 0;
 }
 #endif // USE_CUDA
 
-static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+static int dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
 {
 	uint32_t nx = state->interface[dst_node].vector.nx;
 	size_t elemsize = state->interface[dst_node].vector.elemsize;
@@ -265,75 +288,6 @@ static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t
 	memcpy((void *)ptr_dst, (void *)ptr_src, nx*elemsize);
 
 	TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
-}
-
-int do_copy_vector_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
-{
-	node_kind src_kind = get_node_kind(src_node);
-	node_kind dst_kind = get_node_kind(dst_node);
-
-	switch (dst_kind) {
-	case RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> RAM */
-				 dummy_copy_ram_to_ram(state, src_node, dst_node);
-				 break;
-#ifdef USE_CUDA
-			case CUDA_RAM:
-				/* CUBLAS_RAM -> RAM */
-				/* only the proper CUBLAS thread can initiate this ! */
-				if (get_local_memory_node() == src_node)
-				{
-					/* only the proper CUBLAS thread can initiate this directly ! */
-					copy_cublas_to_ram(state, src_node, dst_node);
-				}
-				else
-				{
-					/* put a request to the corresponding GPU */
-					post_data_request(state, src_node, dst_node);
-				}
-				break;
-#endif
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO
-				break;
-			case UNUSED:
-				printf("error node %d UNUSED\n", src_node);
-			default:
-				assert(0);
-				break;
-		}
-		break;
-#ifdef USE_CUDA
-	case CUDA_RAM:
-		switch (src_kind) {
-			case RAM:
-				/* RAM -> CUBLAS_RAM */
-				/* only the proper CUBLAS thread can initiate this ! */
-				STARPU_ASSERT(get_local_memory_node() == dst_node);
-				copy_ram_to_cublas(state, src_node, dst_node);
-				break;
-			case CUDA_RAM:
-			case SPU_LS:
-				STARPU_ASSERT(0); // TODO 
-				break;
-			case UNUSED:
-			default:
-				STARPU_ASSERT(0);
-				break;
-		}
-		break;
-#endif
-	case SPU_LS:
-		STARPU_ASSERT(0); // TODO
-		break;
-	case UNUSED:
-	default:
-		assert(0);
-		break;
-	}
 
 	return 0;
 }
-