Browse Source

implement a data interface for 3D dense blocks

Cédric Augonnet 16 years ago
parent
commit
d482a9effe

+ 22 - 0
include/starpu-data-interfaces.h

@@ -37,6 +37,27 @@ uint32_t starpu_get_blas_ny(starpu_data_handle handle);
 uint32_t starpu_get_blas_local_ld(starpu_data_handle handle);
 uintptr_t starpu_get_blas_local_ptr(starpu_data_handle handle);
 
+/* BLOCK interface for 3D dense blocks */
+typedef struct starpu_block_interface_s {
+	uintptr_t ptr;
+	uint32_t nx;
+	uint32_t ny;
+	uint32_t nz;
+	uint32_t ldy;	/* number of elements between two lines */
+	uint32_t ldz;	/* number of elements between two planes */
+	size_t elemsize;
+} starpu_block_interface_t;
+
+void starpu_register_block_data(starpu_data_handle *handle, uint32_t home_node,
+                        uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
+                        uint32_t ny, uint32_t nz, size_t elemsize);
+uint32_t starpu_get_block_nx(starpu_data_handle handle);
+uint32_t starpu_get_block_ny(starpu_data_handle handle);
+uint32_t starpu_get_block_nz(starpu_data_handle handle);
+uint32_t starpu_get_block_local_ldy(starpu_data_handle handle);
+uint32_t starpu_get_block_local_ldz(starpu_data_handle handle);
+uintptr_t starpu_get_block_local_ptr(starpu_data_handle handle);
+
 /* vector interface for contiguous (non-strided) buffers */
 typedef struct starpu_vector_interface_s {
 	uintptr_t ptr;
@@ -123,6 +144,7 @@ uint32_t starpu_get_bcsr_c(starpu_data_handle);
 
 typedef union {
 	starpu_blas_interface_t blas;	/* dense BLAS representation */
+	starpu_block_interface_t block;	/* BLOCK interface for 3D dense blocks */
 	starpu_vector_interface_t vector; /* continuous vector */
 	starpu_csr_interface_t csr;	/* compressed sparse row */
 	starpu_csc_interface_t csc; 	/* compressed sparse column */

+ 2 - 0
src/Makefile.am

@@ -61,6 +61,7 @@ noinst_HEADERS = 						\
 	datawizard/coherency.h					\
 	datawizard/memory_nodes.h				\
 	datawizard/interfaces/blas_interface.h			\
+	datawizard/interfaces/block_interface.h			\
 	datawizard/interfaces/csr_filters.h			\
 	datawizard/interfaces/csc_interface.h			\
 	datawizard/interfaces/bcsr_filters.h			\
@@ -124,6 +125,7 @@ libstarpu_la_SOURCES = 						\
 	datawizard/interfaces/csr_interface.c			\
 	datawizard/interfaces/blas_filters.c			\
 	datawizard/interfaces/blas_interface.c			\
+	datawizard/interfaces/block_interface.c			\
 	datawizard/interfaces/vector_interface.c		\
 	datawizard/interfaces/bcsr_filters.c			\
 	datawizard/interfaces/csr_filters.c			\

+ 465 - 0
src/datawizard/interfaces/block_interface.c

@@ -0,0 +1,465 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+#include <datawizard/data_parameters.h>
+#include <datawizard/coherency.h>
+#include <datawizard/copy-driver.h>
+#include <datawizard/hierarchy.h>
+
+#include <common/hash.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+size_t allocate_block_buffer_on_node(data_state *state, uint32_t dst_node);
+void liberate_block_buffer_on_node(starpu_data_interface_t *interface, uint32_t node);
+int do_copy_block_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node);
+size_t dump_block_interface(starpu_data_interface_t *interface, void *buffer);
+size_t block_interface_get_size(struct starpu_data_state_t *state);
+uint32_t footprint_block_interface_crc32(data_state *state, uint32_t hstate);
+void display_block_interface(data_state *state, FILE *f);
+#ifdef USE_GORDON
+int convert_block_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss);
+#endif
+
+struct data_interface_ops_t interface_block_ops = {
+	.allocate_data_on_node = allocate_block_buffer_on_node,
+	.liberate_data_on_node = liberate_block_buffer_on_node,
+	.copy_data_1_to_1 = do_copy_block_buffer_1_to_1,
+	.dump_data_interface = dump_block_interface,
+	.get_size = block_interface_get_size,
+	.footprint = footprint_block_interface_crc32,
+#ifdef USE_GORDON
+	.convert_to_gordon = convert_block_to_gordon,
+#endif
+	.interfaceid = BLAS_INTERFACE, 
+	.display = display_block_interface
+};
+
+#ifdef USE_GORDON
+int convert_block_to_gordon(starpu_data_interface_t *interface, uint64_t *ptr, gordon_strideSize_t *ss) 
+{
+	/* TODO */
+	STARPU_ASSERT(0);
+
+	return 0;
+}
+#endif
+
+/* declare a new data with the BLAS interface */
+void starpu_register_block_data(struct starpu_data_state_t **handle, uint32_t home_node,
+			uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
+			uint32_t ny, uint32_t nz, size_t elemsize)
+{
+	struct starpu_data_state_t *state = calloc(1, sizeof(struct starpu_data_state_t));
+	STARPU_ASSERT(state);
+
+	STARPU_ASSERT(handle);
+	*handle = state;
+
+	unsigned node;
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_block_interface_t *local_interface = &state->interface[node].block;
+
+		if (node == home_node) {
+			local_interface->ptr = ptr;
+			local_interface->ldy  = ldy;
+			local_interface->ldz  = ldz;
+		}
+		else {
+			local_interface->ptr = 0;
+			local_interface->ldy  = 0;
+			local_interface->ldz  = 0;
+		}
+
+		local_interface->nx = nx;
+		local_interface->ny = ny;
+		local_interface->nz = nz;
+		local_interface->elemsize = elemsize;
+	}
+
+	state->ops = &interface_block_ops;
+
+	register_new_data(state, home_node, 0);
+}
+
+static inline uint32_t footprint_block_interface_generic(uint32_t (*hash_func)(uint32_t input, uint32_t hstate), data_state *state, uint32_t hstate)
+{
+	uint32_t hash;
+
+	hash = hstate;
+	hash = hash_func(starpu_get_block_nx(state), hash);
+	hash = hash_func(starpu_get_block_ny(state), hash);
+	hash = hash_func(starpu_get_block_nz(state), hash);
+
+	return hash;
+}
+
+uint32_t footprint_block_interface_crc32(data_state *state, uint32_t hstate)
+{
+	return footprint_block_interface_generic(crc32_be, state, hstate);
+}
+
+struct dumped_block_interface_s {
+	uintptr_t ptr;
+	uint32_t nx;
+	uint32_t ny;
+	uint32_t nz;
+	uint32_t ldy;
+	uint32_t ldz;
+} __attribute__ ((packed));
+
+void display_block_interface(data_state *state, FILE *f)
+{
+	starpu_block_interface_t *interface;
+
+	interface = &state->interface[0].block;
+
+	fprintf(f, "%d\t%d\t%d\t", interface->nx, interface->ny, interface->nz);
+}
+
+size_t dump_block_interface(starpu_data_interface_t *interface, void *_buffer)
+{
+	/* yes, that's DIRTY ... */
+	struct dumped_block_interface_s *buffer = _buffer;
+
+	buffer->ptr = (*interface).block.ptr;
+	buffer->nx = (*interface).block.nx;
+	buffer->ny = (*interface).block.ny;
+	buffer->nz = (*interface).block.nz;
+	buffer->ldy = (*interface).block.ldy;
+	buffer->ldz = (*interface).block.ldz;
+
+	return (sizeof(struct dumped_block_interface_s));
+}
+
+size_t block_interface_get_size(struct starpu_data_state_t *state)
+{
+	size_t size;
+	starpu_block_interface_t *interface;
+
+	interface = &state->interface[0].block;
+
+	size = interface->nx*interface->ny*interface->nz*interface->elemsize; 
+
+	return size;
+}
+
+/* offer an access to the data parameters */
+uint32_t starpu_get_block_nx(data_state *state)
+{
+	return (state->interface[0].block.nx);
+}
+
+uint32_t starpu_get_block_ny(data_state *state)
+{
+	return (state->interface[0].block.ny);
+}
+
+uint32_t starpu_get_block_nz(data_state *state)
+{
+	return (state->interface[0].block.nz);
+}
+
+uint32_t starpu_get_block_local_ldy(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].block.ldy);
+}
+
+uint32_t starpu_get_block_local_ldz(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].block.ldz);
+}
+
+uintptr_t starpu_get_block_local_ptr(data_state *state)
+{
+	unsigned node;
+	node = get_local_memory_node();
+
+	STARPU_ASSERT(state->per_node[node].allocated);
+
+	return (state->interface[node].block.ptr);
+}
+
+/* memory allocation/deallocation primitives for the BLAS interface */
+
+/* returns the size of the allocated area */
+size_t allocate_block_buffer_on_node(data_state *state, uint32_t dst_node)
+{
+	uintptr_t addr = 0;
+	unsigned fail = 0;
+	size_t allocated_memory;
+
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+	uint32_t nx = state->interface[dst_node].block.nx;
+	uint32_t ny = state->interface[dst_node].block.ny;
+	uint32_t nz = state->interface[dst_node].block.nz;
+	size_t elemsize = state->interface[dst_node].block.elemsize;
+
+	node_kind kind = get_node_kind(dst_node);
+
+	switch(kind) {
+		case RAM:
+			addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
+			if (!addr) 
+				fail = 1;
+
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			status = cublasAlloc(nx*ny*nz, elemsize, (void **)&addr);
+
+			if (!addr || status != CUBLAS_STATUS_SUCCESS)
+			{
+				STARPU_ASSERT(status != CUBLAS_STATUS_INTERNAL_ERROR);
+				STARPU_ASSERT(status != CUBLAS_STATUS_NOT_INITIALIZED);
+				STARPU_ASSERT(status != CUBLAS_STATUS_INVALID_VALUE);
+				STARPU_ASSERT(status == CUBLAS_STATUS_ALLOC_FAILED);
+				fail = 1;
+			}
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+
+	if (!fail) {
+		/* allocation succeeded */
+		allocated_memory = nx*ny*nz*elemsize;
+
+		/* update the data properly in consequence */
+		state->interface[dst_node].block.ptr = addr;
+		state->interface[dst_node].block.ldy = nx;
+		state->interface[dst_node].block.ldz = nx*ny;
+	} else {
+		/* allocation failed */
+		allocated_memory = 0;
+	}
+	
+	return allocated_memory;
+}
+
+void liberate_block_buffer_on_node(starpu_data_interface_t *interface, uint32_t node)
+{
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+
+	node_kind kind = get_node_kind(node);
+	switch(kind) {
+		case RAM:
+			free((void*)interface->block.ptr);
+			break;
+#ifdef USE_CUDA
+		case CUDA_RAM:
+			status = cublasFree((void*)interface->block.ptr);
+			
+			STARPU_ASSERT(status != CUBLAS_STATUS_INTERNAL_ERROR);
+			STARPU_ASSERT(status == CUBLAS_STATUS_SUCCESS);
+
+			break;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+#ifdef USE_CUDA
+static void copy_cublas_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = &state->interface[src_node].block;
+	dst_block = &state->interface[dst_node].block;
+
+	if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
+	{
+		/* we are lucky */
+		cublasGetMatrix(src_block->nx*src_block->ny, src_block->nz, src_block->elemsize,
+			(uint8_t *)src_block->ptr, src_block->ldz,
+			(uint8_t *)dst_block->ptr, dst_block->ldz);
+	}
+	else {
+		unsigned layer;
+		for (layer = 0; layer < src_block->nz; layer++)
+		{
+			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) 
+						+ src_block->ldz*src_block->elemsize;
+			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) 
+						+ dst_block->ldz*dst_block->elemsize;
+
+			cublasGetMatrix(src_block->nx, src_block->ny, src_block->elemsize,
+				src_ptr, src_block->ldy, dst_ptr, dst_block->ldy);
+		}
+	}
+
+	TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
+}
+
+static void copy_ram_to_cublas(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	starpu_block_interface_t *src_block;
+	starpu_block_interface_t *dst_block;
+
+	src_block = &state->interface[src_node].block;
+	dst_block = &state->interface[dst_node].block;
+
+	if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
+	{
+		/* we are lucky */
+		cublasSetMatrix(src_block->nx*src_block->ny, src_block->nz, src_block->elemsize,
+			(uint8_t *)src_block->ptr, src_block->ldz,
+			(uint8_t *)dst_block->ptr, dst_block->ldz);
+	}
+	else {
+		unsigned layer;
+		for (layer = 0; layer < src_block->nz; layer++)
+		{
+			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) 
+						+ src_block->ldz*src_block->elemsize;
+			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) 
+						+ dst_block->ldz*dst_block->elemsize;
+
+			cublasSetMatrix(src_block->nx, src_block->ny, src_block->elemsize,
+				src_ptr, src_block->ldy, dst_ptr, dst_block->ldy);
+		}
+	}
+
+	TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+}
+#endif // USE_CUDA
+
+/* as not all platform easily have a BLAS lib installed ... */
+static void dummy_copy_ram_to_ram(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	uint32_t nx = state->interface[dst_node].block.nx;
+	uint32_t ny = state->interface[dst_node].block.ny;
+	uint32_t nz = state->interface[dst_node].block.nz;
+	size_t elemsize = state->interface[dst_node].block.elemsize;
+
+	uint32_t ldy_src = state->interface[src_node].block.ldy;
+	uint32_t ldz_src = state->interface[src_node].block.ldz;
+	uint32_t ldy_dst = state->interface[dst_node].block.ldy;
+	uint32_t ldz_dst = state->interface[dst_node].block.ldz;
+
+	uintptr_t ptr_src = state->interface[src_node].block.ptr;
+	uintptr_t ptr_dst = state->interface[dst_node].block.ptr;
+
+	unsigned y, z;
+	for (z = 0; z < nz; z++)
+	for (y = 0; y < ny; y++)
+	{
+		uint32_t src_offset = (y*ldy_src + y*z*ldz_src)*elemsize;
+		uint32_t dst_offset = (y*ldy_dst + y*z*ldz_dst)*elemsize;
+
+		memcpy((void *)(ptr_dst + dst_offset), 
+			(void *)(ptr_src + src_offset), nx*elemsize);
+	}
+
+	TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
+}
+
+
+int do_copy_block_buffer_1_to_1(data_state *state, uint32_t src_node, uint32_t dst_node)
+{
+	node_kind src_kind = get_node_kind(src_node);
+	node_kind dst_kind = get_node_kind(dst_node);
+
+	switch (dst_kind) {
+	case RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> RAM */
+				 dummy_copy_ram_to_ram(state, src_node, dst_node);
+				 break;
+#ifdef USE_CUDA
+			case CUDA_RAM:
+				/* CUBLAS_RAM -> RAM */
+				if (get_local_memory_node() == src_node)
+				{
+					/* only the proper CUBLAS thread can initiate this directly ! */
+					copy_cublas_to_ram(state, src_node, dst_node);
+				}
+				else
+				{
+					/* put a request to the corresponding GPU */
+		//			fprintf(stderr, "post_data_request state %p src %d dst %d\n", state, src_node, dst_node);
+					post_data_request(state, src_node, dst_node);
+		//			fprintf(stderr, "post %p OK\n", state);
+				}
+				break;
+#endif
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO
+				break;
+			case UNUSED:
+				printf("error node %d UNUSED\n", src_node);
+			default:
+				assert(0);
+				break;
+		}
+		break;
+#ifdef USE_CUDA
+	case CUDA_RAM:
+		switch (src_kind) {
+			case RAM:
+				/* RAM -> CUBLAS_RAM */
+				/* only the proper CUBLAS thread can initiate this ! */
+				STARPU_ASSERT(get_local_memory_node() == dst_node);
+				copy_ram_to_cublas(state, src_node, dst_node);
+				break;
+			case CUDA_RAM:
+			case SPU_LS:
+				STARPU_ASSERT(0); // TODO 
+				break;
+			case UNUSED:
+			default:
+				STARPU_ASSERT(0);
+				break;
+		}
+		break;
+#endif
+	case SPU_LS:
+		STARPU_ASSERT(0); // TODO
+		break;
+	case UNUSED:
+	default:
+		assert(0);
+		break;
+	}
+
+	return 0;
+}
+

+ 24 - 0
src/datawizard/interfaces/block_interface.h

@@ -0,0 +1,24 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLOCK_INTERFACE_H__
+#define __BLOCK_INTERFACE_H__
+
+#include <stdint.h>
+
+#define BLOCK_INTERFACE   0x118512
+
+#endif // __BLOCK_INTERFACE_H__