Pārlūkot izejas kodu

Make STARPU_MALLOC_ and STARPU_MEMORY_ flags coherent, so that we can pass the latter to starpu_malloc_flags too. Add _flags versions to starpu_malloc_on_node, and add starpu_malloc_on_node_set_default_flags to allow setting the flags used by starpu_malloc_on_node

Samuel Thibault 10 gadi atpakaļ
vecāks
revīzija
dbafc4b732

+ 3 - 0
ChangeLog

@@ -158,6 +158,9 @@ Small features:
   * Add STARPU_NAME to specify a task name from a starpu_task_insert call.
   * Add starpu_task_get_task_succs to get the list of children of a given
     task.
+  * Add starpu_malloc_on_node_flags, starpu_free_on_node_flags, and
+    starpu_malloc_on_node_set_default_flags to control the allocation flags
+    used for allocations done by starpu.
 
 Changes:
   * Data interfaces (variable, vector, matrix and block) now define

+ 17 - 1
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -1015,9 +1015,20 @@ designated by \p interface.
 Applications can provide their own interface as shown in \ref
 DefiningANewDataInterface.
 
+\fn uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags)
+\ingroup API_Data_Interfaces
+Allocate \p size bytes on node \p dst_node with the given allocation \p flags. This returns 0 if
+allocation failed; the allocation method should then return <c>-ENOMEM</c> as
+allocated size. Deallocation must be done with starpu_free_on_node_flags with the same \p flags.
+
+\fn void starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+\ingroup API_Data_Interfaces
+Free \p addr of \p size bytes on node \p dst_node which was previously allocated
+with starpu_malloc_on_node_flags with the same allocation \p flags.
+
 \fn uintptr_t starpu_malloc_on_node(unsigned dst_node, size_t size)
 \ingroup API_Data_Interfaces
-Allocate \p size bytes on node \p dst_node. This returns 0 if
+Allocate \p size bytes on node \p dst_node with the default allocation flags. This returns 0 if
 allocation failed, the allocation method should then return <c>-ENOMEM</c> as
 allocated size. Deallocation must be done with starpu_free_on_node.
 
@@ -1026,6 +1037,11 @@ allocated size. Deallocation must be done with starpu_free_on_node.
 Free \p addr of \p size bytes on node \p dst_node which was previously allocated
 with starpu_malloc_on_node.
 
+\fn void starpu_malloc_on_node_set_default_flags(unsigned node, int flags)
+\ingroup API_Data_Interfaces
+Define the default flags for allocations performed by starpu_malloc_on_node() and
+starpu_free_on_node() on node \p node. The default is STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT.
+
 \fn int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
 \ingroup API_Data_Interfaces
 Copy \p size bytes from byte offset \p src_offset of \p src on \p src_node

+ 5 - 4
doc/doxygen/chapters/api/standard_memory_library.doxy

@@ -39,10 +39,11 @@ starpu_free_flags() with the same flag.
 Value passed to the function starpu_malloc_flags() along STARPU_MALLOC_COUNT
 to indicate that while the memory allocation should be kept in the limits
 defined for STARPU_MALLOC_COUNT, no reclaiming should be performed by
-starpu_malloc_flags itself, thus potentially overflowing the memory node a
-bit. StarPU will reclaim memory after next task termination, according to
-the STARPU_MINIMUM_AVAILABLE_MEM and STARPU_TARGET_AVAILABLE_MEM environment
-variables.
+starpu_malloc_flags itself, thus potentially overflowing the
+memory node a bit. StarPU will reclaim memory after next task termination,
+according to the STARPU_MINIMUM_AVAILABLE_MEM and STARPU_TARGET_AVAILABLE_MEM
+environment variables. If STARPU_MEMORY_WAIT is set, no overflowing will happen:
+starpu_malloc_flags() will wait for other eviction mechanisms to release enough memory.
 
 \fn int starpu_malloc_flags(void **A, size_t dim, int flags)
 \ingroup API_Standard_Memory_Library

+ 3 - 0
include/starpu_data_interfaces.h

@@ -86,8 +86,11 @@ struct starpu_data_copy_methods
 };
 
 int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
+uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags);
 uintptr_t starpu_malloc_on_node(unsigned dst_node, size_t size);
+void starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags);
 void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
+void starpu_malloc_on_node_set_default_flags(unsigned node, int flags);
 
 enum starpu_data_interface_id
 {

+ 5 - 5
include/starpu_stdlib.h

@@ -26,8 +26,11 @@ extern "C"
 #endif
 
 #define STARPU_MALLOC_PINNED	((1ULL)<<1)
-#define STARPU_MALLOC_COUNT	((1ULL)<<3)
-#define STARPU_MALLOC_NORECLAIM	((1ULL)<<4)
+#define STARPU_MALLOC_COUNT	((1ULL)<<2)
+#define STARPU_MALLOC_NORECLAIM	((1ULL)<<3)
+
+#define STARPU_MEMORY_WAIT	((1ULL)<<4)
+#define STARPU_MEMORY_OVERFLOW	((1ULL)<<5)
 
 void starpu_malloc_set_align(size_t align);
 
@@ -44,9 +47,6 @@ starpu_ssize_t starpu_memory_get_total(unsigned node);
 starpu_ssize_t starpu_memory_get_available(unsigned node);
 void starpu_memory_wait_available(unsigned node, size_t size);
 
-#define STARPU_MEMORY_WAIT (1)
-#define STARPU_MEMORY_OVERFLOW (2)
-
 /**
  * Try to allocate memory on the given node
  *

+ 51 - 21
src/datawizard/malloc.c

@@ -30,6 +30,7 @@
 
 static size_t _malloc_align = sizeof(void*);
 static int disable_pinning;
+static int malloc_on_node_default_flags[STARPU_MAXNODES];
 
 void starpu_malloc_set_align(size_t align)
 {
@@ -96,7 +97,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 	if (flags & STARPU_MALLOC_COUNT)
 	{
 		if (!(flags & STARPU_MALLOC_NORECLAIM))
-			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, 0) != 0)
+			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags) != 0)
 			{
 				size_t freed;
 				size_t reclaim = 2 * dim;
@@ -104,15 +105,17 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 				_STARPU_TRACE_START_MEMRECLAIM(STARPU_MAIN_RAM,0);
 				freed = _starpu_memory_reclaim_generic(STARPU_MAIN_RAM, 0, reclaim);
 				_STARPU_TRACE_END_MEMRECLAIM(STARPU_MAIN_RAM,0);
-				if (freed < dim)
+				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
 				{
 					// We could not reclaim enough memory
 					*A = NULL;
 					return -ENOMEM;
 				}
 			}
+		else if (flags & STARPU_MEMORY_WAIT)
+			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags);
 		else
-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, STARPU_MEMORY_OVERFLOW);
+			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
 	}
 
 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
@@ -387,7 +390,7 @@ static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALI
 #endif
 
 static uintptr_t
-_starpu_malloc_on_node(unsigned dst_node, size_t size)
+_starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
 	uintptr_t addr = 0;
 
@@ -395,8 +398,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 	cudaError_t status;
 #endif
 
-	if (starpu_memory_allocate(dst_node, size, 0) != 0)
-		return 0;
+	/* Handle count first */
+	if (flags & STARPU_MALLOC_COUNT)
+	{
+		if (starpu_memory_allocate(dst_node, size, flags) != 0)
+			return 0;
+		/* And prevent double-count in starpu_malloc_flags */
+		flags &= ~STARPU_MALLOC_COUNT;
+	}
 
 	switch(starpu_node_get_kind(dst_node))
 	{
@@ -409,9 +418,9 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 					 * requires waiting for a task, and we
 					 * may be called with a spinlock held
 					 */
-					0
+					flags & ~STARPU_MALLOC_PINNED
 #else
-					STARPU_MALLOC_PINNED
+					flags
 #endif
 					);
 			break;
@@ -518,17 +527,19 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 }
 
 void
-_starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
+_starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
+	int count = flags & STARPU_MALLOC_COUNT;
+	flags &= ~STARPU_MALLOC_COUNT;
 	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
 	switch(kind)
 	{
 		case STARPU_CPU_RAM:
 			starpu_free_flags((void*)addr, size,
 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
-					0
+					flags & ~STARPU_MALLOC_PINNED
 #else
-					STARPU_MALLOC_PINNED
+					flags
 #endif
 					);
 			break;
@@ -605,7 +616,8 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 		default:
 			STARPU_ABORT();
 	}
-	starpu_memory_deallocate(dst_node, size);
+	if (count)
+		starpu_memory_deallocate(dst_node, size);
 
 }
 
@@ -696,6 +708,7 @@ _starpu_malloc_init(unsigned dst_node)
 	nfreechunks[dst_node] = 0;
 	STARPU_PTHREAD_MUTEX_INIT(&chunk_mutex[dst_node], NULL);
 	disable_pinning = starpu_get_env_number("STARPU_DISABLE_PINNING");
+	malloc_on_node_default_flags[dst_node] = STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT;
 }
 
 void
@@ -709,7 +722,7 @@ _starpu_malloc_shutdown(unsigned dst_node)
 	     chunk = next_chunk)
 	{
 		next_chunk = _starpu_chunk_list_next(chunk);
-		_starpu_free_on_node(dst_node, chunk->base, CHUNK_SIZE);
+		_starpu_free_on_node_flags(dst_node, chunk->base, CHUNK_SIZE, malloc_on_node_default_flags[dst_node]);
 		_starpu_chunk_list_erase(&chunks[dst_node], chunk);
 		free(chunk);
 	}
@@ -718,10 +731,10 @@ _starpu_malloc_shutdown(unsigned dst_node)
 }
 
 /* Create a new chunk */
-static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node)
+static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node, int flags)
 {
 	struct _starpu_chunk *chunk;
-	uintptr_t base = _starpu_malloc_on_node(dst_node, CHUNK_SIZE);
+	uintptr_t base = _starpu_malloc_on_node(dst_node, CHUNK_SIZE, flags);
 
 	if (!base)
 		return NULL;
@@ -744,11 +757,11 @@ static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node)
 }
 
 uintptr_t
-starpu_malloc_on_node(unsigned dst_node, size_t size)
+starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags)
 {
 	/* Big allocation, allocate normally */
 	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
-		return _starpu_malloc_on_node(dst_node, size);
+		return _starpu_malloc_on_node(dst_node, size, flags);
 
 	/* Round up allocation to block size */
 	int nblocks = (size + CHUNK_ALLOC_MIN - 1) / CHUNK_ALLOC_MIN;
@@ -802,7 +815,7 @@ starpu_malloc_on_node(unsigned dst_node, size_t size)
 	}
 
 	/* Didn't find a big enough segment, create another chunk.  */
-	chunk = _starpu_new_chunk(dst_node);
+	chunk = _starpu_new_chunk(dst_node, flags);
 	if (!chunk)
 	{
 		/* Really no memory any more, fail */
@@ -842,12 +855,12 @@ found:
 }
 
 void
-starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
+starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
 	/* Big allocation, deallocate normally */
 	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
 	{
-		_starpu_free_on_node(dst_node, addr, size);
+		_starpu_free_on_node_flags(dst_node, addr, size, flags);
 		return;
 	}
 
@@ -917,7 +930,7 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 		if (nfreechunks[dst_node] >= 1)
 		{
 			/* We already have free chunks, release this one */
-			_starpu_free_on_node(dst_node, chunk->base, CHUNK_SIZE);
+			_starpu_free_on_node_flags(dst_node, chunk->base, CHUNK_SIZE, flags);
 			_starpu_chunk_list_erase(&chunks[dst_node], chunk);
 			free(chunk);
 		}
@@ -933,3 +946,20 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
 	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
 }
+
+void starpu_malloc_on_node_set_default_flags(unsigned node, int flags)
+{
+	malloc_on_node_default_flags[node] = flags;
+}
+
+uintptr_t
+starpu_malloc_on_node(unsigned dst_node, size_t size)
+{
+	return starpu_malloc_on_node_flags(dst_node, size, malloc_on_node_default_flags[dst_node]);
+}
+
+void
+starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
+{
+	starpu_free_on_node_flags(dst_node, addr, size, malloc_on_node_default_flags[dst_node]);
+}

+ 3 - 0
src/datawizard/memory_nodes.c

@@ -20,6 +20,7 @@
 #include <datawizard/datastats.h>
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
+#include <datawizard/malloc.h>
 #include <common/fxt.h>
 #include "copy_driver.h"
 #include "memalloc.h"
@@ -156,6 +157,8 @@ unsigned _starpu_memory_node_register(enum starpu_node_kind kind, int devid)
 	/* for now, there is no condition associated to that newly created node */
 	descr.condition_count[node] = 0;
 
+	_starpu_malloc_init(node);
+
 	return node;
 }
 

+ 0 - 2
src/drivers/cuda/driver_cuda.c

@@ -602,8 +602,6 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
 		_starpu_cuda_limit_gpu_mem_if_needed(devid);
 		_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));
-
-		_starpu_malloc_init(memnode);
 	}
 
 	/* one more time to avoid hacks from third party lib :) */

+ 0 - 2
src/drivers/opencl/driver_opencl.c

@@ -617,8 +617,6 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
 	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));
 
-	_starpu_malloc_init(worker->memory_node);
-
 	float size = (float) global_mem[devid] / (1<<30);
 
 #ifdef STARPU_SIMGRID