Browse Source

port r11312 from 1.1: Small CUDA allocations (<= 4MiB) are now batched to avoid the huge cudaMalloc overhead.

Samuel Thibault 12 years ago
parent
commit
0e385974a9
6 changed files with 332 additions and 3 deletions
  1. 2 0
      ChangeLog
  2. 1 0
      src/Makefile.am
  3. 3 0
      src/core/workers.c
  4. 301 3
      src/datawizard/malloc.c
  5. 23 0
      src/datawizard/malloc.h
  6. 2 0
      src/drivers/cuda/driver_cuda.c

+ 2 - 0
ChangeLog

@@ -50,6 +50,8 @@ New features:
   * Tasks can now define a optional prologue callback which is executed
   * Tasks can now define a optional prologue callback which is executed
     on the host when the task becomes ready for execution, before getting
     on the host when the task becomes ready for execution, before getting
     scheduled.
     scheduled.
+  * Small CUDA allocations (<= 4MiB) are now batched to avoid the huge
+    cudaMalloc overhead.
 
 
 Small features:
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 1 - 0
src/Makefile.am

@@ -88,6 +88,7 @@ noinst_HEADERS = 						\
 	datawizard/filters.h					\
 	datawizard/filters.h					\
 	datawizard/write_back.h					\
 	datawizard/write_back.h					\
 	datawizard/datastats.h					\
 	datawizard/datastats.h					\
+	datawizard/malloc.h					\
 	datawizard/memstats.h					\
 	datawizard/memstats.h					\
 	datawizard/memory_manager.h				\
 	datawizard/memory_manager.h				\
 	datawizard/memalloc.h					\
 	datawizard/memalloc.h					\

+ 3 - 0
src/core/workers.c

@@ -27,6 +27,7 @@
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/disk.h>
 #include <core/disk.h>
 #include <core/task.h>
 #include <core/task.h>
+#include <datawizard/malloc.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <starpu_task_list.h>
 #include <starpu_task_list.h>
 #include <drivers/mp_common/sink_common.h>
 #include <drivers/mp_common/sink_common.h>
@@ -996,6 +997,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 
 	_starpu_data_interface_init();
 	_starpu_data_interface_init();
 
 
+	_starpu_malloc_init();
+
 	_starpu_timing_init();
 	_starpu_timing_init();
 
 
 	_starpu_profiling_init();
 	_starpu_profiling_init();

+ 301 - 3
src/datawizard/malloc.c

@@ -354,8 +354,8 @@ static starpu_pthread_mutex_t cuda_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZE
 static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 #endif
 #endif
 
 
-uintptr_t
-starpu_malloc_on_node(unsigned dst_node, size_t size)
+static uintptr_t
+_starpu_malloc_on_node(unsigned dst_node, size_t size)
 {
 {
 	uintptr_t addr = 0;
 	uintptr_t addr = 0;
 
 
@@ -462,7 +462,7 @@ starpu_malloc_on_node(unsigned dst_node, size_t size)
 }
 }
 
 
 void
 void
-starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
+_starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 {
 {
 	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 	switch(kind)
 	switch(kind)
@@ -533,3 +533,301 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
 
 }
 }
 
 
+/*
+ * On CUDA which has very expensive malloc, for small sizes, allocate big
+ * chunks divided in blocks, and we actually allocate segments of consecutive
+ * blocks.
+ *
+ * We try to keep the list of chunks with increasing occupancy, so we can
+ * quickly find free segments to allocate.
+ */
+
+/* Size of each chunk, 32MiB granularity brings 128 chunks to be allocated in
+ * order to fill a 4GiB GPU. */
+#define CHUNK_SIZE (32*1024*1024)
+
+/* Maximum segment size we will allocate in chunks; bigger requests go
+ * straight to _starpu_malloc_on_node (i.e. cudaMalloc). */
+#define CHUNK_ALLOC_MAX (CHUNK_SIZE / 8)
+
+/* Granularity of allocation, i.e. block size, StarPU will never allocate less
+ * than this.
+ * 16KiB (i.e. 64x64 float) granularity eats 2MiB RAM for managing a 4GiB GPU.
+ */
+#define CHUNK_ALLOC_MIN (16*1024)
+
+/* Number of blocks per chunk */
+#define CHUNK_NBLOCKS (CHUNK_SIZE/CHUNK_ALLOC_MIN)
+
+/* Linked list for available segments; entries live inside a chunk's bitmap
+ * array, so "pointers" are block indices (-1 terminates the list). */
+struct block {
+	int length;	/* Number of consecutive free blocks */
+	int next;	/* next free segment */
+};
+
+/* One chunk */
+LIST_TYPE(_starpu_chunk,
+	/* Device address of the CHUNK_SIZE allocation backing this chunk */
+	uintptr_t base;
+
+	/* Available number of blocks, for debugging */
+	int available;
+
+	/* Overestimation of the maximum size of available segments in this chunk */
+	int available_max;
+
+	/* Bitmap describing availability of the block */
+	/* Block 0 is always empty, and is just the head of the free segments list */
+	struct block bitmap[CHUNK_NBLOCKS+1];
+)
+
+/* One list of chunks per node */
+static struct _starpu_chunk_list *chunks[STARPU_MAXNODES];
+/* Number of completely free chunks */
+static int nfreechunks[STARPU_MAXNODES];
+/* This protects chunks and nfreechunks */
+static starpu_pthread_mutex_t chunk_mutex[STARPU_MAXNODES];
+
+/* Set up the per-node chunk bookkeeping; called once at StarPU
+ * initialization, before any worker can allocate. */
+void
+_starpu_malloc_init(void)
+{
+	unsigned node;
+
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		STARPU_PTHREAD_MUTEX_INIT(&chunk_mutex[node], NULL);
+		nfreechunks[node] = 0;
+		chunks[node] = _starpu_chunk_list_new();
+	}
+}
+
+/* Release all chunks of dst_node; called at driver deinit, once all
+ * automatically-allocated buffers have been freed, so every chunk is
+ * expected to be completely free (asserted below). */
+void
+_starpu_malloc_shutdown(unsigned dst_node)
+{
+	struct _starpu_chunk *chunk, *next_chunk;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);
+	/* Grab the next pointer before erasing, since erase invalidates it */
+	for (chunk = _starpu_chunk_list_begin(chunks[dst_node]);
+	     chunk != _starpu_chunk_list_end(chunks[dst_node]);
+	     chunk = next_chunk)
+	{
+		next_chunk = _starpu_chunk_list_next(chunk);
+		STARPU_ASSERT(chunk->available == CHUNK_NBLOCKS);
+		/* Return the device memory backing this chunk */
+		_starpu_free_on_node(dst_node, chunk->base, CHUNK_SIZE);
+		/* NOTE(review): list cells are released with free() rather than
+		 * _starpu_chunk_delete() — presumably equivalent for LIST_TYPE
+		 * cells; confirm against the list implementation. */
+		_starpu_chunk_list_erase(chunks[dst_node], chunk);
+		free(chunk);
+	}
+	_starpu_chunk_list_delete(chunks[dst_node]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
+	STARPU_PTHREAD_MUTEX_DESTROY(&chunk_mutex[dst_node]);
+}
+
+/* Allocate CHUNK_SIZE bytes on dst_node and wrap them in a fresh chunk
+ * descriptor containing one single free segment covering the whole chunk.
+ * Returns NULL when the device allocation fails. */
+static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node)
+{
+	uintptr_t chunk_base = _starpu_malloc_on_node(dst_node, CHUNK_SIZE);
+
+	if (!chunk_base)
+		return NULL;
+
+	struct _starpu_chunk *new_chunk = _starpu_chunk_new();
+	new_chunk->base = chunk_base;
+	new_chunk->available = CHUNK_NBLOCKS;
+	new_chunk->available_max = CHUNK_NBLOCKS;
+
+	/* bitmap[0] is a fake head whose next field points to the free
+	 * segments list */
+	new_chunk->bitmap[0].length = 0;
+	new_chunk->bitmap[0].next = 1;
+
+	/* Initially, a single big free segment spans the whole chunk */
+	new_chunk->bitmap[1].length = CHUNK_NBLOCKS;
+	new_chunk->bitmap[1].next = -1;
+
+	return new_chunk;
+}
+
+/* Allocate size bytes on dst_node.  Small CUDA allocations
+ * (<= CHUNK_ALLOC_MAX) are carved out of pre-allocated chunks to avoid the
+ * cudaMalloc overhead; everything else falls through to
+ * _starpu_malloc_on_node.  Returns 0 and sets errno to ENOMEM on failure. */
+uintptr_t
+starpu_malloc_on_node(unsigned dst_node, size_t size)
+{
+	/* Big allocation, or not CUDA memory: allocate normally */
+	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
+		return _starpu_malloc_on_node(dst_node, size);
+
+	/* Round up allocation to block size */
+	int nblocks = (size + CHUNK_ALLOC_MIN - 1) / CHUNK_ALLOC_MIN;
+
+	struct _starpu_chunk *chunk;
+	int prevblock, block;
+	int available_max;
+	struct block *bitmap;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);
+
+	/* Try to find a big enough segment among the chunks */
+	for (chunk = _starpu_chunk_list_begin(chunks[dst_node]);
+	     chunk != _starpu_chunk_list_end(chunks[dst_node]);
+	     chunk = _starpu_chunk_list_next(chunk))
+	{
+		/* available_max is an overestimation, so a lower value proves
+		 * this chunk has no big enough segment */
+		if (chunk->available_max < nblocks)
+			continue;
+
+		bitmap = chunk->bitmap;
+		available_max = 0;
+		/* Walk the free segments list, starting from the fake head
+		 * at bitmap[0]; -1 terminates the list */
+		for (prevblock = block = 0;
+			block != -1;
+			prevblock = block, block = bitmap[prevblock].next)
+		{
+			STARPU_ASSERT(block >= 0 && block <= CHUNK_NBLOCKS);
+			int length = bitmap[block].length;
+			if (length >= nblocks) {
+
+				if (length >= 2*nblocks)
+				{
+					/* This one has quite some room,
+					 * put it front, to make finding it
+					 * easier next time. */
+					_starpu_chunk_list_erase(chunks[dst_node], chunk);
+					_starpu_chunk_list_push_front(chunks[dst_node], chunk);
+				}
+				if (chunk->available == CHUNK_NBLOCKS)
+					/* This one was empty, it's not empty any more */
+					nfreechunks[dst_node]--;
+				goto found;
+			}
+			if (length > available_max)
+				available_max = length;
+		}
+
+		/* Didn't find a big enough segment in this chunk, its
+		 * available_max is out of date */
+		chunk->available_max = available_max;
+	}
+
+	/* Didn't find a big enough segment, create another chunk.  */
+	chunk = _starpu_new_chunk(dst_node);
+	if (!chunk)
+	{
+		/* Really no memory any more, fail */
+		STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
+		errno = ENOMEM;
+		return 0;
+	}
+
+	/* And make it easy to find. */
+	_starpu_chunk_list_push_front(chunks[dst_node], chunk);
+	/* Mimic the state the search loop establishes before "goto found":
+	 * the fresh chunk's only free segment starts at block 1 */
+	bitmap = chunk->bitmap;
+	prevblock = 0;
+	block = 1;
+
+found:
+
+	chunk->available -= nblocks;
+	STARPU_ASSERT(bitmap[block].length >= nblocks);
+	STARPU_ASSERT(block <= CHUNK_NBLOCKS);
+	if (bitmap[block].length == nblocks)
+	{
+		/* Fits exactly, drop this segment from the skip list */
+		bitmap[prevblock].next = bitmap[block].next;
+	}
+	else
+	{
+		/* Still some room: carve nblocks off the front of the segment
+		 * and relink its shrunk remainder */
+		STARPU_ASSERT(block + nblocks <= CHUNK_NBLOCKS);
+		bitmap[prevblock].next = block + nblocks;
+		bitmap[block + nblocks].length = bitmap[block].length - nblocks;
+		bitmap[block + nblocks].next = bitmap[block].next;
+	}
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
+
+	/* block-1 because bitmap[0] is the fake head: block 1 maps to base */
+	return chunk->base + (block-1) * CHUNK_ALLOC_MIN;
+}
+
+/* Free a buffer previously returned by starpu_malloc_on_node.  Small CUDA
+ * segments are returned to their owning chunk's free list (merging with
+ * neighbouring free segments); everything else goes straight to
+ * _starpu_free_on_node.  size must match the size passed at allocation. */
+void
+starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
+{
+	/* Big allocation, deallocate normally */
+	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
+	{
+		_starpu_free_on_node(dst_node, addr, size);
+		return;
+	}
+
+	struct _starpu_chunk *chunk;
+
+	/* Round up allocation to block size, as starpu_malloc_on_node did */
+	int nblocks = (size + CHUNK_ALLOC_MIN - 1) / CHUNK_ALLOC_MIN;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);
+	/* Find the chunk this address was carved from */
+	for (chunk = _starpu_chunk_list_begin(chunks[dst_node]);
+	     chunk != _starpu_chunk_list_end(chunks[dst_node]);
+	     chunk = _starpu_chunk_list_next(chunk))
+		if (addr >= chunk->base && addr < chunk->base + CHUNK_SIZE)
+			break;
+	STARPU_ASSERT(chunk != _starpu_chunk_list_end(chunks[dst_node]));
+
+	struct block *bitmap = chunk->bitmap;
+	/* +1 because bitmap[0] is the fake head of the free segments list */
+	int block = ((addr - chunk->base) / CHUNK_ALLOC_MIN) + 1, prevblock, nextblock;
+
+	/* Look for free segment just before this one */
+	for (prevblock = 0;
+		prevblock != -1;
+		prevblock = nextblock)
+	{
+		STARPU_ASSERT(prevblock >= 0 && prevblock <= CHUNK_NBLOCKS);
+		nextblock = bitmap[prevblock].next;
+		if (nextblock > block || nextblock == -1)
+			break;
+	}
+	STARPU_ASSERT(prevblock != -1);
+
+	chunk->available += nblocks;
+
+	/* Insert in free segments list */
+	bitmap[block].next = nextblock;
+	bitmap[prevblock].next = block;
+	bitmap[block].length = nblocks;
+
+	STARPU_ASSERT(nextblock >= -1 && nextblock <= CHUNK_NBLOCKS);
+	if (nextblock == block + nblocks)
+	{
+		/* This freed segment is just before a free segment, merge them */
+		bitmap[block].next = bitmap[nextblock].next;
+		bitmap[block].length += bitmap[nextblock].length;
+	}
+
+	if (prevblock > 0 && prevblock + bitmap[prevblock].length == block)
+	{
+		/* This free segment is just after a free segment, merge them */
+		bitmap[prevblock].next = bitmap[block].next;
+		bitmap[prevblock].length += bitmap[block].length;
+		block = prevblock;
+	}
+
+	/* Keep available_max a valid overestimation of the largest free
+	 * segment.  Updating it only on merge (as before) let a freed
+	 * segment bigger than a stale available_max go unnoticed, making
+	 * starpu_malloc_on_node skip this chunk forever for mid-size
+	 * requests. */
+	if (bitmap[block].length > chunk->available_max)
+		chunk->available_max = bitmap[block].length;
+
+	if (chunk->available == CHUNK_NBLOCKS)
+	{
+		/* This chunk is now empty, but avoid chunk free/alloc
+		 * ping-pong by keeping some of these.  */
+		if (nfreechunks[dst_node] >= 1) {
+			/* We already have free chunks, release this one */
+			_starpu_free_on_node(dst_node, chunk->base, CHUNK_SIZE);
+			_starpu_chunk_list_erase(chunks[dst_node], chunk);
+			free(chunk);
+		} else
+			nfreechunks[dst_node]++;
+	}
+	else
+	{
+		/* Freed some room, put this first in chunks list so the next
+		 * allocation finds the room quickly */
+		_starpu_chunk_list_erase(chunks[dst_node], chunk);
+		_starpu_chunk_list_push_front(chunks[dst_node], chunk);
+	}
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
+}

+ 23 - 0
src/datawizard/malloc.h

@@ -0,0 +1,23 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __ALLOC_H__
+#define __ALLOC_H__
+
+/* Set up the per-node chunk lists used to batch small CUDA allocations;
+ * called once from starpu_initialize(). */
+void _starpu_malloc_init(void);
+/* Release the (fully free) chunks of dst_node; called from the CUDA
+ * driver deinit path. */
+void _starpu_malloc_shutdown(unsigned dst_node);
+
+#endif

+ 2 - 0
src/drivers/cuda/driver_cuda.c

@@ -509,6 +509,8 @@ int _starpu_cuda_driver_deinit(struct starpu_driver *d)
 	 * coherency is not maintained anymore at that point ! */
 	 * coherency is not maintained anymore at that point ! */
 	_starpu_free_all_automatically_allocated_buffers(memnode);
 	_starpu_free_all_automatically_allocated_buffers(memnode);
 
 
+	_starpu_malloc_shutdown(memnode);
+
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	deinit_context(args->workerid);
 	deinit_context(args->workerid);
 #endif
 #endif