Browse Source

port r11312 from 1.1: Small CUDA allocations (<= 4MiB) are now batched to avoid the huge cudaMalloc overhead.

Samuel Thibault 12 years ago
parent
commit
0e385974a9
6 changed files with 332 additions and 3 deletions
  1. 2 0
      ChangeLog
  2. 1 0
      src/Makefile.am
  3. 3 0
      src/core/workers.c
  4. 301 3
      src/datawizard/malloc.c
  5. 23 0
      src/datawizard/malloc.h
  6. 2 0
      src/drivers/cuda/driver_cuda.c

+ 2 - 0
ChangeLog

@@ -50,6 +50,8 @@ New features:
   * Tasks can now define a optional prologue callback which is executed
   * Tasks can now define a optional prologue callback which is executed
     on the host when the task becomes ready for execution, before getting
     on the host when the task becomes ready for execution, before getting
     scheduled.
     scheduled.
+  * Small CUDA allocations (<= 4MiB) are now batched to avoid the huge
+    cudaMalloc overhead.
 
 
 Small features:
 Small features:
   * New functions starpu_data_acquire_cb_sequential_consistency() and
   * New functions starpu_data_acquire_cb_sequential_consistency() and

+ 1 - 0
src/Makefile.am

@@ -88,6 +88,7 @@ noinst_HEADERS = 						\
 	datawizard/filters.h					\
 	datawizard/filters.h					\
 	datawizard/write_back.h					\
 	datawizard/write_back.h					\
 	datawizard/datastats.h					\
 	datawizard/datastats.h					\
+	datawizard/malloc.h					\
 	datawizard/memstats.h					\
 	datawizard/memstats.h					\
 	datawizard/memory_manager.h				\
 	datawizard/memory_manager.h				\
 	datawizard/memalloc.h					\
 	datawizard/memalloc.h					\

+ 3 - 0
src/core/workers.c

@@ -27,6 +27,7 @@
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/disk.h>
 #include <core/disk.h>
 #include <core/task.h>
 #include <core/task.h>
+#include <datawizard/malloc.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <starpu_task_list.h>
 #include <starpu_task_list.h>
 #include <drivers/mp_common/sink_common.h>
 #include <drivers/mp_common/sink_common.h>
@@ -996,6 +997,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 
 	_starpu_data_interface_init();
 	_starpu_data_interface_init();
 
 
+	_starpu_malloc_init();
+
 	_starpu_timing_init();
 	_starpu_timing_init();
 
 
 	_starpu_profiling_init();
 	_starpu_profiling_init();

+ 301 - 3
src/datawizard/malloc.c

@@ -354,8 +354,8 @@ static starpu_pthread_mutex_t cuda_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZE
 static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 #endif
 #endif
 
 
-uintptr_t
-starpu_malloc_on_node(unsigned dst_node, size_t size)
+static uintptr_t
+_starpu_malloc_on_node(unsigned dst_node, size_t size)
 {
 {
 	uintptr_t addr = 0;
 	uintptr_t addr = 0;
 
 
@@ -462,7 +462,7 @@ starpu_malloc_on_node(unsigned dst_node, size_t size)
 }
 }
 
 
 void
 void
-starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
+_starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 {
 {
 	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 	switch(kind)
 	switch(kind)
@@ -533,3 +533,301 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
 
 }
 }
 
 
+/*
+ * On CUDA which has very expensive malloc, for small sizes, allocate big
+ * chunks divided in blocks, and we actually allocate segments of consecutive
+ * blocks.
+ *
+ * We try to keep the list of chunks with increasing occupancy, so we can
+ * quickly find free segments to allocate.
+ */
+
+/* Size of each chunk, 32MiB granularity brings 128 chunks to be allocated in
+ * order to fill a 4GiB GPU. */
+#define CHUNK_SIZE (32*1024*1024)
+
+/* Maximum segment size we will allocate in chunks; bigger requests go
+ * straight to _starpu_malloc_on_node (i.e. cudaMalloc). */
+#define CHUNK_ALLOC_MAX (CHUNK_SIZE / 8)
+
+/* Granularity of allocation, i.e. block size, StarPU will never allocate less
+ * than this.
+ * 16KiB (i.e. 64x64 float) granularity eats 2MiB RAM for managing a 4GiB GPU.
+ */
+#define CHUNK_ALLOC_MIN (16*1024)
+
+/* Number of blocks per chunk */
+#define CHUNK_NBLOCKS (CHUNK_SIZE/CHUNK_ALLOC_MIN)
+
+/* Linked list for available segments; entries live inside a chunk's bitmap
+ * array, so "pointers" are block indices (-1 terminates the list). */
+struct block {
+	int length;	/* Number of consecutive free blocks */
+	int next;	/* next free segment */
+};
+
+/* One chunk */
+LIST_TYPE(_starpu_chunk,
+	/* Device address of the CHUNK_SIZE allocation backing this chunk */
+	uintptr_t base;
+
+	/* Available number of blocks, for debugging */
+	int available;
+
+	/* Overestimation of the maximum size of available segments in this chunk */
+	int available_max;
+
+	/* Bitmap describing availability of the block */
+	/* Block 0 is always empty, and is just the head of the free segments list */
+	struct block bitmap[CHUNK_NBLOCKS+1];
+)
+
+/* One list of chunks per node */
+static struct _starpu_chunk_list *chunks[STARPU_MAXNODES];
+/* Number of completely free chunks */
+static int nfreechunks[STARPU_MAXNODES];
+/* This protects chunks and nfreechunks */
+static starpu_pthread_mutex_t chunk_mutex[STARPU_MAXNODES];
+
+/* Set up the per-node chunk bookkeeping; called once at StarPU
+ * initialization, before any worker can allocate. */
+void
+_starpu_malloc_init(void)
+{
+	unsigned node;
+
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		STARPU_PTHREAD_MUTEX_INIT(&chunk_mutex[node], NULL);
+		nfreechunks[node] = 0;
+		chunks[node] = _starpu_chunk_list_new();
+	}
+}
+
+/* Release all chunks of dst_node; called at driver deinit, once all
+ * automatically-allocated buffers have been freed, so every chunk is
+ * expected to be completely free (asserted below). */
+void
+_starpu_malloc_shutdown(unsigned dst_node)
+{
+	struct _starpu_chunk *chunk, *next_chunk;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);
+	/* Grab the next pointer before erasing, since erase invalidates it */
+	for (chunk = _starpu_chunk_list_begin(chunks[dst_node]);
+	     chunk != _starpu_chunk_list_end(chunks[dst_node]);
+	     chunk = next_chunk)
+	{
+		next_chunk = _starpu_chunk_list_next(chunk);
+		STARPU_ASSERT(chunk->available == CHUNK_NBLOCKS);
+		/* Return the device memory backing this chunk */
+		_starpu_free_on_node(dst_node, chunk->base, CHUNK_SIZE);
+		/* NOTE(review): list cells are released with free() rather than
+		 * _starpu_chunk_delete() — presumably equivalent for LIST_TYPE
+		 * cells; confirm against the list implementation. */
+		_starpu_chunk_list_erase(chunks[dst_node], chunk);
+		free(chunk);
+	}
+	_starpu_chunk_list_delete(chunks[dst_node]);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
+	STARPU_PTHREAD_MUTEX_DESTROY(&chunk_mutex[dst_node]);
+}
+
+/* Allocate CHUNK_SIZE bytes on dst_node and wrap them in a fresh chunk
+ * descriptor containing one single free segment covering the whole chunk.
+ * Returns NULL when the device allocation fails. */
+static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node)
+{
+	uintptr_t chunk_base = _starpu_malloc_on_node(dst_node, CHUNK_SIZE);
+
+	if (!chunk_base)
+		return NULL;
+
+	struct _starpu_chunk *new_chunk = _starpu_chunk_new();
+	new_chunk->base = chunk_base;
+	new_chunk->available = CHUNK_NBLOCKS;
+	new_chunk->available_max = CHUNK_NBLOCKS;
+
+	/* bitmap[0] is a fake head whose next field points to the free
+	 * segments list */
+	new_chunk->bitmap[0].length = 0;
+	new_chunk->bitmap[0].next = 1;
+
+	/* Initially, a single big free segment spans the whole chunk */
+	new_chunk->bitmap[1].length = CHUNK_NBLOCKS;
+	new_chunk->bitmap[1].next = -1;
+
+	return new_chunk;
+}
+
+/* Allocate size bytes on dst_node.  Small CUDA allocations
+ * (<= CHUNK_ALLOC_MAX) are carved out of pre-allocated chunks to avoid the
+ * cudaMalloc overhead; everything else falls through to
+ * _starpu_malloc_on_node.  Returns 0 and sets errno to ENOMEM on failure. */
+uintptr_t
+starpu_malloc_on_node(unsigned dst_node, size_t size)
+{
+	/* Big allocation, or not CUDA memory: allocate normally */
+	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
+		return _starpu_malloc_on_node(dst_node, size);
+
+	/* Round up allocation to block size */
+	int nblocks = (size + CHUNK_ALLOC_MIN - 1) / CHUNK_ALLOC_MIN;
+
+	struct _starpu_chunk *chunk;
+	int prevblock, block;
+	int available_max;
+	struct block *bitmap;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);
+
+	/* Try to find a big enough segment among the chunks */
+	for (chunk = _starpu_chunk_list_begin(chunks[dst_node]);
+	     chunk != _starpu_chunk_list_end(chunks[dst_node]);
+	     chunk = _starpu_chunk_list_next(chunk))
+	{
+		/* available_max is an overestimation, so a lower value proves
+		 * this chunk has no big enough segment */
+		if (chunk->available_max < nblocks)
+			continue;
+
+		bitmap = chunk->bitmap;
+		available_max = 0;
+		/* Walk the free segments list, starting from the fake head
+		 * at bitmap[0]; -1 terminates the list */
+		for (prevblock = block = 0;
+			block != -1;
+			prevblock = block, block = bitmap[prevblock].next)
+		{
+			STARPU_ASSERT(block >= 0 && block <= CHUNK_NBLOCKS);
+			int length = bitmap[block].length;
+			if (length >= nblocks) {
+
+				if (length >= 2*nblocks)
+				{
+					/* This one has quite some room,
+					 * put it front, to make finding it
+					 * easier next time. */
+					_starpu_chunk_list_erase(chunks[dst_node], chunk);
+					_starpu_chunk_list_push_front(chunks[dst_node], chunk);
+				}
+				if (chunk->available == CHUNK_NBLOCKS)
+					/* This one was empty, it's not empty any more */
+					nfreechunks[dst_node]--;
+				goto found;
+			}
+			if (length > available_max)
+				available_max = length;
+		}
+
+		/* Didn't find a big enough segment in this chunk, its
+		 * available_max is out of date */
+		chunk->available_max = available_max;
+	}
+
+	/* Didn't find a big enough segment, create another chunk.  */
+	chunk = _starpu_new_chunk(dst_node);
+	if (!chunk)
+	{
+		/* Really no memory any more, fail */
+		STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
+		errno = ENOMEM;
+		return 0;
+	}
+
+	/* And make it easy to find. */
+	_starpu_chunk_list_push_front(chunks[dst_node], chunk);
+	/* Mimic the state the search loop establishes before "goto found":
+	 * the fresh chunk's only free segment starts at block 1 */
+	bitmap = chunk->bitmap;
+	prevblock = 0;
+	block = 1;
+
+found:
+
+	chunk->available -= nblocks;
+	STARPU_ASSERT(bitmap[block].length >= nblocks);
+	STARPU_ASSERT(block <= CHUNK_NBLOCKS);
+	if (bitmap[block].length == nblocks)
+	{
+		/* Fits exactly, drop this segment from the skip list */
+		bitmap[prevblock].next = bitmap[block].next;
+	}
+	else
+	{
+		/* Still some room: carve nblocks off the front of the segment
+		 * and relink its shrunk remainder */
+		STARPU_ASSERT(block + nblocks <= CHUNK_NBLOCKS);
+		bitmap[prevblock].next = block + nblocks;
+		bitmap[block + nblocks].length = bitmap[block].length - nblocks;
+		bitmap[block + nblocks].next = bitmap[block].next;
+	}
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
+
+	/* block-1 because bitmap[0] is the fake head: block 1 maps to base */
+	return chunk->base + (block-1) * CHUNK_ALLOC_MIN;
+}
+
+/* Free a buffer previously returned by starpu_malloc_on_node.  Small CUDA
+ * segments are returned to their owning chunk's free list (merging with
+ * neighbouring free segments); everything else goes straight to
+ * _starpu_free_on_node.  size must match the size passed at allocation. */
+void
+starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
+{
+	/* Big allocation, deallocate normally */
+	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
+	{
+		_starpu_free_on_node(dst_node, addr, size);
+		return;
+	}
+
+	struct _starpu_chunk *chunk;
+
+	/* Round up allocation to block size, as starpu_malloc_on_node did */
+	int nblocks = (size + CHUNK_ALLOC_MIN - 1) / CHUNK_ALLOC_MIN;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);
+	/* Find the chunk this address was carved from */
+	for (chunk = _starpu_chunk_list_begin(chunks[dst_node]);
+	     chunk != _starpu_chunk_list_end(chunks[dst_node]);
+	     chunk = _starpu_chunk_list_next(chunk))
+		if (addr >= chunk->base && addr < chunk->base + CHUNK_SIZE)
+			break;
+	STARPU_ASSERT(chunk != _starpu_chunk_list_end(chunks[dst_node]));
+
+	struct block *bitmap = chunk->bitmap;
+	/* +1 because bitmap[0] is the fake head of the free segments list */
+	int block = ((addr - chunk->base) / CHUNK_ALLOC_MIN) + 1, prevblock, nextblock;
+
+	/* Look for free segment just before this one */
+	for (prevblock = 0;
+		prevblock != -1;
+		prevblock = nextblock)
+	{
+		STARPU_ASSERT(prevblock >= 0 && prevblock <= CHUNK_NBLOCKS);
+		nextblock = bitmap[prevblock].next;
+		if (nextblock > block || nextblock == -1)
+			break;
+	}
+	STARPU_ASSERT(prevblock != -1);
+
+	chunk->available += nblocks;
+
+	/* Insert in free segments list */
+	bitmap[block].next = nextblock;
+	bitmap[prevblock].next = block;
+	bitmap[block].length = nblocks;
+
+	STARPU_ASSERT(nextblock >= -1 && nextblock <= CHUNK_NBLOCKS);
+	if (nextblock == block + nblocks)
+	{
+		/* This freed segment is just before a free segment, merge them */
+		bitmap[block].next = bitmap[nextblock].next;
+		bitmap[block].length += bitmap[nextblock].length;
+	}
+
+	if (prevblock > 0 && prevblock + bitmap[prevblock].length == block)
+	{
+		/* This free segment is just after a free segment, merge them */
+		bitmap[prevblock].next = bitmap[block].next;
+		bitmap[prevblock].length += bitmap[block].length;
+		block = prevblock;
+	}
+
+	/* Keep available_max a valid overestimation of the largest free
+	 * segment.  Updating it only on merge (as before) let a freed
+	 * segment bigger than a stale available_max go unnoticed, making
+	 * starpu_malloc_on_node skip this chunk forever for mid-size
+	 * requests. */
+	if (bitmap[block].length > chunk->available_max)
+		chunk->available_max = bitmap[block].length;
+
+	if (chunk->available == CHUNK_NBLOCKS)
+	{
+		/* This chunk is now empty, but avoid chunk free/alloc
+		 * ping-pong by keeping some of these.  */
+		if (nfreechunks[dst_node] >= 1) {
+			/* We already have free chunks, release this one */
+			_starpu_free_on_node(dst_node, chunk->base, CHUNK_SIZE);
+			_starpu_chunk_list_erase(chunks[dst_node], chunk);
+			free(chunk);
+		} else
+			nfreechunks[dst_node]++;
+	}
+	else
+	{
+		/* Freed some room, put this first in chunks list so the next
+		 * allocation finds the room quickly */
+		_starpu_chunk_list_erase(chunks[dst_node], chunk);
+		_starpu_chunk_list_push_front(chunks[dst_node], chunk);
+	}
+
+	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
+}

+ 23 - 0
src/datawizard/malloc.h

@@ -0,0 +1,23 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2013  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __ALLOC_H__
+#define __ALLOC_H__
+
+/* Set up the per-node chunk lists used to batch small CUDA allocations;
+ * called once from starpu_initialize(). */
+void _starpu_malloc_init(void);
+/* Release the (fully free) chunks of dst_node; called from the CUDA
+ * driver deinit path. */
+void _starpu_malloc_shutdown(unsigned dst_node);
+
+#endif

+ 2 - 0
src/drivers/cuda/driver_cuda.c

@@ -509,6 +509,8 @@ int _starpu_cuda_driver_deinit(struct starpu_driver *d)
 	 * coherency is not maintained anymore at that point ! */
 	 * coherency is not maintained anymore at that point ! */
 	_starpu_free_all_automatically_allocated_buffers(memnode);
 	_starpu_free_all_automatically_allocated_buffers(memnode);
 
 
+	_starpu_malloc_shutdown(memnode);
+
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	deinit_context(args->workerid);
 	deinit_context(args->workerid);
 #endif
 #endif