10 gadi atpakaļ · dbafc4b732
--- a/ChangeLog
+++ b/ChangeLog
@@ -158,6 +158,9 @@ Small features:
 
				   * Add STARPU_NAME to specify a task name from a starpu_task_insert call.
			
 
				   * Add starpu_task_get_task_succs to get the list of children of a given
			
 
				     task.
			
 
				+  * Add starpu_malloc_on_node_flags, starpu_free_on_node_flags, and
			
 
				+    starpu_malloc_on_node_set_default_flags to control the allocation flags
			
 
				+    used for allocations done by starpu.
			
 
				 
			
 
				 Changes:
			
 
				   * Data interfaces (variable, vector, matrix and block) now define
			
--- a/doc/doxygen/chapters/api/data_interfaces.doxy
+++ b/doc/doxygen/chapters/api/data_interfaces.doxy
@@ -1015,9 +1015,20 @@ designated by \p interface.
 
				 Applications can provide their own interface as shown in \ref
			
 
				 DefiningANewDataInterface.
			
 
				 
			
 
				+\fn uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags)
			
 
				+\ingroup API_Data_Interfaces
			
 
				+Allocate \p size bytes on node \p dst_node with the given allocation \p flags. This returns 0 if
			
 
				+allocation failed, the allocation method should then return <c>-ENOMEM</c> as
			
 
				+allocated size. Deallocation must be done with starpu_free_on_node_flags() with the same \p flags.
			
 
				+
			
 
				+\fn void starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
			
 
				+\ingroup API_Data_Interfaces
			
 
				+Free \p addr of \p size bytes on node \p dst_node which was previously allocated
			
 
				+with starpu_malloc_on_node_flags() with the given allocation \p flags.
			
 
				+
			
 
				 \fn uintptr_t starpu_malloc_on_node(unsigned dst_node, size_t size)
			
 
				 \ingroup API_Data_Interfaces
			
 
				-Allocate \p size bytes on node \p dst_node. This returns 0 if
			
 
				+Allocate \p size bytes on node \p dst_node with the default allocation flags. This returns 0 if
			
 
				 allocation failed, the allocation method should then return <c>-ENOMEM</c> as
			
 
				 allocated size. Deallocation must be done with starpu_free_on_node.
			
 
				 
			
@@ -1026,6 +1037,11 @@ allocated size. Deallocation must be done with starpu_free_on_node.
 
				 Free \p addr of \p size bytes on node \p dst_node which was previously allocated
			
 
				 with starpu_malloc_on_node.
			
 
				 
			
 
				+\fn void starpu_malloc_on_node_set_default_flags(unsigned node, int flags)
			
 
				+\ingroup API_Data_Interfaces
			
 
				+Define the default flags for allocations performed by starpu_malloc_on_node() and
			
 
				+starpu_free_on_node(). The default is STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT.
			
 
				+
			
 
				 \fn int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
			
 
				 \ingroup API_Data_Interfaces
			
 
				 Copy \p size bytes from byte offset \p src_offset of \p src on \p src_node
			
--- a/doc/doxygen/chapters/api/standard_memory_library.doxy
+++ b/doc/doxygen/chapters/api/standard_memory_library.doxy
@@ -39,10 +39,11 @@ starpu_free_flags() with the same flag.
 
				 Value passed to the function starpu_malloc_flags() along STARPU_MALLOC_COUNT
			
 
				 to indicate that while the memory allocation should be kept in the limits
			
 
				 defined for STARPU_MALLOC_COUNT, no reclaiming should be performed by
			
 
				-starpu_malloc_flags itself, thus potentially overflowing the memory node a
			
 
				-bit. StarPU will reclaim memory after next task termination, according to
			
 
				-the STARPU_MINIMUM_AVAILABLE_MEM and STARPU_TARGET_AVAILABLE_MEM environment
			
 
				-variables.
			
 
				+starpu_malloc_flags itself, thus potentially overflowing the
			
 
				+memory node a bit. StarPU will reclaim memory after next task termination,
			
 
				+according to the STARPU_MINIMUM_AVAILABLE_MEM and STARPU_TARGET_AVAILABLE_MEM
			
 
				+environment variables. If STARPU_MEMORY_WAIT is set, no overflowing will happen,
			
 
				+starpu_malloc_flags() will wait for other eviction mechanisms to release enough memory.
			
 
				 
			
 
				 \fn int starpu_malloc_flags(void **A, size_t dim, int flags)
			
 
				 \ingroup API_Standard_Memory_Library
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -86,8 +86,11 @@ struct starpu_data_copy_methods
 
				 };
			
 
				 
			
 
				 int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
			
 
				+uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags);
			
 
				 uintptr_t starpu_malloc_on_node(unsigned dst_node, size_t size);
			
 
				+void starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags);
			
 
				 void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
			
 
				+void starpu_malloc_on_node_set_default_flags(unsigned node, int flags);
			
 
				 
			
 
				 enum starpu_data_interface_id
			
 
				 {
			
--- a/include/starpu_stdlib.h
+++ b/include/starpu_stdlib.h
@@ -26,8 +26,11 @@ extern "C"
 
				 #endif
			
 
				 
			
 
				 #define STARPU_MALLOC_PINNED	((1ULL)<<1)
			
 
				-#define STARPU_MALLOC_COUNT	((1ULL)<<3)
			
 
				-#define STARPU_MALLOC_NORECLAIM	((1ULL)<<4)
			
 
				+#define STARPU_MALLOC_COUNT	((1ULL)<<2)
			
 
				+#define STARPU_MALLOC_NORECLAIM	((1ULL)<<3)
			
 
				+
			
 
				+#define STARPU_MEMORY_WAIT	((1ULL)<<4)
			
 
				+#define STARPU_MEMORY_OVERFLOW	((1ULL)<<5)
			
 
				 
			
 
				 void starpu_malloc_set_align(size_t align);
			
 
				 
			
@@ -44,9 +47,6 @@ starpu_ssize_t starpu_memory_get_total(unsigned node);
 
				 starpu_ssize_t starpu_memory_get_available(unsigned node);
			
 
				 void starpu_memory_wait_available(unsigned node, size_t size);
			
 
				 
			
 
				-#define STARPU_MEMORY_WAIT (1)
			
 
				-#define STARPU_MEMORY_OVERFLOW (2)
			
 
				-
			
 
				 /**
			
 
				  * Try to allocate memory on the given node
			
 
				  *
			
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -30,6 +30,7 @@
 
				 
			
 
				 static size_t _malloc_align = sizeof(void*);
			
 
				 static int disable_pinning;
			
 
				+static int malloc_on_node_default_flags[STARPU_MAXNODES];
			
 
				 
			
 
				 void starpu_malloc_set_align(size_t align)
			
 
				 {
			
@@ -96,7 +97,7 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
				 	if (flags & STARPU_MALLOC_COUNT)
			
 
				 	{
			
 
				 		if (!(flags & STARPU_MALLOC_NORECLAIM))
			
 
				-			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, 0) != 0)
			
 
				+			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags) != 0)
			
 
				 			{
			
 
				 				size_t freed;
			
 
				 				size_t reclaim = 2 * dim;
			
@@ -104,15 +105,17 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
				 				_STARPU_TRACE_START_MEMRECLAIM(STARPU_MAIN_RAM,0);
			
 
				 				freed = _starpu_memory_reclaim_generic(STARPU_MAIN_RAM, 0, reclaim);
			
 
				 				_STARPU_TRACE_END_MEMRECLAIM(STARPU_MAIN_RAM,0);
			
 
				-				if (freed < dim)
			
 
				+				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
			
 
				 				{
			
 
				 					// We could not reclaim enough memory
			
 
				 					*A = NULL;
			
 
				 					return -ENOMEM;
			
 
				 				}
			
 
				 			}
			
 
				+		else if (flags & STARPU_MEMORY_WAIT)
			
 
				+			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags);
			
 
				 		else
			
 
				-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, STARPU_MEMORY_OVERFLOW);
			
 
				+			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
			
 
				 	}
			
 
				 
			
 
				 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
			
@@ -387,7 +390,7 @@ static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALI
 
				 #endif
			
 
				 
			
 
				 static uintptr_t
			
 
				-_starpu_malloc_on_node(unsigned dst_node, size_t size)
			
 
				+_starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
			
 
				 {
			
 
				 	uintptr_t addr = 0;
			
 
				 
			
@@ -395,8 +398,14 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 
				 	cudaError_t status;
			
 
				 #endif
			
 
				 
			
 
				-	if (starpu_memory_allocate(dst_node, size, 0) != 0)
			
 
				-		return 0;
			
 
				+	/* Handle count first */
			
 
				+	if (flags & STARPU_MALLOC_COUNT)
			
 
				+	{
			
 
				+		if (starpu_memory_allocate(dst_node, size, flags) != 0)
			
 
				+			return 0;
			
 
				+		/* And prevent double-count in starpu_malloc_flags */
			
 
				+		flags &= ~STARPU_MALLOC_COUNT;
			
 
				+	}
			
 
				 
			
 
				 	switch(starpu_node_get_kind(dst_node))
			
 
				 	{
			
@@ -409,9 +418,9 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 
				 					 * requires waiting for a task, and we
			
 
				 					 * may be called with a spinlock held
			
 
				 					 */
			
 
				-					0
			
 
				+					flags & ~STARPU_MALLOC_PINNED
			
 
				 #else
			
 
				-					STARPU_MALLOC_PINNED
			
 
				+					flags
			
 
				 #endif
			
 
				 					);
			
 
				 			break;
			
@@ -518,17 +527,19 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size)
 
				 }
			
 
				 
			
 
				 void
			
 
				-_starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
			
 
				+_starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
			
 
				 {
			
 
				+	int count = flags & STARPU_MALLOC_COUNT;
			
 
				+	flags &= ~STARPU_MALLOC_COUNT;
			
 
				 	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
			
 
				 	switch(kind)
			
 
				 	{
			
 
				 		case STARPU_CPU_RAM:
			
 
				 			starpu_free_flags((void*)addr, size,
			
 
				 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
			
 
				-					0
			
 
				+					flags & ~STARPU_MALLOC_PINNED
			
 
				 #else
			
 
				-					STARPU_MALLOC_PINNED
			
 
				+					flags
			
 
				 #endif
			
 
				 					);
			
 
				 			break;
			
@@ -605,7 +616,8 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
				 		default:
			
 
				 			STARPU_ABORT();
			
 
				 	}
			
 
				-	starpu_memory_deallocate(dst_node, size);
			
 
				+	if (count)
			
 
				+		starpu_memory_deallocate(dst_node, size);
			
 
				 
			
 
				 }
			
 
				 
			
@@ -696,6 +708,7 @@ _starpu_malloc_init(unsigned dst_node)
 
				 	nfreechunks[dst_node] = 0;
			
 
				 	STARPU_PTHREAD_MUTEX_INIT(&chunk_mutex[dst_node], NULL);
			
 
				 	disable_pinning = starpu_get_env_number("STARPU_DISABLE_PINNING");
			
 
				+	malloc_on_node_default_flags[dst_node] = STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT;
			
 
				 }
			
 
				 
			
 
				 void
			
@@ -709,7 +722,7 @@ _starpu_malloc_shutdown(unsigned dst_node)
 
				 	     chunk = next_chunk)
			
 
				 	{
			
 
				 		next_chunk = _starpu_chunk_list_next(chunk);
			
 
				-		_starpu_free_on_node(dst_node, chunk->base, CHUNK_SIZE);
			
 
				+		_starpu_free_on_node_flags(dst_node, chunk->base, CHUNK_SIZE, malloc_on_node_default_flags[dst_node]);
			
 
				 		_starpu_chunk_list_erase(&chunks[dst_node], chunk);
			
 
				 		free(chunk);
			
 
				 	}
			
@@ -718,10 +731,10 @@ _starpu_malloc_shutdown(unsigned dst_node)
 
				 }
			
 
				 
			
 
				 /* Create a new chunk */
			
 
				-static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node)
			
 
				+static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node, int flags)
			
 
				 {
			
 
				 	struct _starpu_chunk *chunk;
			
 
				-	uintptr_t base = _starpu_malloc_on_node(dst_node, CHUNK_SIZE);
			
 
				+	uintptr_t base = _starpu_malloc_on_node(dst_node, CHUNK_SIZE, flags);
			
 
				 
			
 
				 	if (!base)
			
 
				 		return NULL;
			
@@ -744,11 +757,11 @@ static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node)
 
				 }
			
 
				 
			
 
				 uintptr_t
			
 
				-starpu_malloc_on_node(unsigned dst_node, size_t size)
			
 
				+starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags)
			
 
				 {
			
 
				 	/* Big allocation, allocate normally */
			
 
				 	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
			
 
				-		return _starpu_malloc_on_node(dst_node, size);
			
 
				+		return _starpu_malloc_on_node(dst_node, size, flags);
			
 
				 
			
 
				 	/* Round up allocation to block size */
			
 
				 	int nblocks = (size + CHUNK_ALLOC_MIN - 1) / CHUNK_ALLOC_MIN;
			
@@ -802,7 +815,7 @@ starpu_malloc_on_node(unsigned dst_node, size_t size)
 
				 	}
			
 
				 
			
 
				 	/* Didn't find a big enough segment, create another chunk.  */
			
 
				-	chunk = _starpu_new_chunk(dst_node);
			
 
				+	chunk = _starpu_new_chunk(dst_node, flags);
			
 
				 	if (!chunk)
			
 
				 	{
			
 
				 		/* Really no memory any more, fail */
			
@@ -842,12 +855,12 @@ found:
 
				 }
			
 
				 
			
 
				 void
			
 
				-starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
			
 
				+starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
			
 
				 {
			
 
				 	/* Big allocation, deallocate normally */
			
 
				 	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
			
 
				 	{
			
 
				-		_starpu_free_on_node(dst_node, addr, size);
			
 
				+		_starpu_free_on_node_flags(dst_node, addr, size, flags);
			
 
				 		return;
			
 
				 	}
			
 
				 
			
@@ -917,7 +930,7 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
				 		if (nfreechunks[dst_node] >= 1)
			
 
				 		{
			
 
				 			/* We already have free chunks, release this one */
			
 
				-			_starpu_free_on_node(dst_node, chunk->base, CHUNK_SIZE);
			
 
				+			_starpu_free_on_node_flags(dst_node, chunk->base, CHUNK_SIZE, flags);
			
 
				 			_starpu_chunk_list_erase(&chunks[dst_node], chunk);
			
 
				 			free(chunk);
			
 
				 		}
			
@@ -933,3 +946,20 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
			
 
				 }
			
 
				+
			
 
				+void starpu_malloc_on_node_set_default_flags(unsigned node, int flags)
			
 
				+{
			
 
				+	malloc_on_node_default_flags[node] = flags;
			
 
				+}
			
 
				+
			
 
				+uintptr_t
			
 
				+starpu_malloc_on_node(unsigned dst_node, size_t size)
			
 
				+{
			
 
				+	return starpu_malloc_on_node_flags(dst_node, size, malloc_on_node_default_flags[dst_node]);
			
 
				+}
			
 
				+
			
 
				+void
			
 
				+starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
			
 
				+{
			
 
				+	starpu_free_on_node_flags(dst_node, addr, size, malloc_on_node_default_flags[dst_node]);
			
 
				+}
			
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -20,6 +20,7 @@
 
				 #include <datawizard/datastats.h>
			
 
				 #include <datawizard/memory_manager.h>
			
 
				 #include <datawizard/memory_nodes.h>
			
 
				+#include <datawizard/malloc.h>
			
 
				 #include <common/fxt.h>
			
 
				 #include "copy_driver.h"
			
 
				 #include "memalloc.h"
			
@@ -156,6 +157,8 @@ unsigned _starpu_memory_node_register(enum starpu_node_kind kind, int devid)
 
				 	/* for now, there is no condition associated to that newly created node */
			
 
				 	descr.condition_count[node] = 0;
			
 
				 
			
 
				+	_starpu_malloc_init(node);
			
 
				+
			
 
				 	return node;
			
 
				 }
			
 
				 
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -602,8 +602,6 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 
				 
			
 
				 		_starpu_cuda_limit_gpu_mem_if_needed(devid);
			
 
				 		_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));
			
 
				-
			
 
				-		_starpu_malloc_init(memnode);
			
 
				 	}
			
 
				 
			
 
				 	/* one more time to avoid hacks from third party lib :) */
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -617,8 +617,6 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 
				 	_starpu_opencl_limit_gpu_mem_if_needed(devid);
			
 
				 	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));
			
 
				 
			
 
				-	_starpu_malloc_init(worker->memory_node);
			
 
				-
			
 
				 	float size = (float) global_mem[devid] / (1<<30);
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID