
Add starpu_memory_pin and starpu_memory_unpin

Samuel Thibault 10 years ago
parent commit 2290e3bdab

+ 2 - 0
ChangeLog

@@ -130,6 +130,8 @@ Small features:
     its own allocation to the reclaiming engine.
   * Add STARPU_SIMGRID_CUDA_MALLOC_COST and STARPU_SIMGRID_CUDA_QUEUE_COST to
     disable CUDA costs simulation in simgrid mode.
+  * Add starpu_memory_pin and starpu_memory_unpin to pin memory allocated
+    by means other than starpu_malloc.
 
 Changes:
   * Data interfaces (variable, vector, matrix and block) now define

+ 3 - 2
doc/doxygen/chapters/07data_management.doxy

@@ -13,8 +13,9 @@ intro qui parle de coherency entre autres
 \section DataManagement Data Management
 
 When the application allocates data, whenever possible it should use
-the function starpu_malloc(), which will ask CUDA or OpenCL to make
-the allocation itself and pin the corresponding allocated memory. This
+the starpu_malloc() function, which will ask CUDA or OpenCL to make
+the allocation itself and pin the corresponding allocated memory, or use the
+starpu_memory_pin() function to pin memory allocated by other means, such as local arrays. This
 is needed to permit asynchronous data transfer, i.e. permit data
 transfer to overlap with computations. Otherwise, the trace will show
 that the <c>DriverCopyAsync</c> state takes a lot of time, this is
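
As an illustration of the recommendation above (a minimal sketch, not part of this commit; the data registration and task submission around the two calls are only assumed context), both paths lead to pinned host memory so that transfers can overlap with computation:

#include <starpu.h>

#define N 1024

int main(void)
{
	float *vec;
	float local[N];       /* allocated outside starpu_malloc(), e.g. a local array */
	starpu_data_handle_t h1, h2;

	if (starpu_init(NULL) != 0)
		return 1;

	/* Option 1: let StarPU allocate and pin the buffer itself. */
	starpu_malloc((void **)&vec, N * sizeof(*vec));
	starpu_vector_data_register(&h1, STARPU_MAIN_RAM, (uintptr_t)vec, N, sizeof(*vec));

	/* Option 2: pin memory that was allocated another way. */
	starpu_memory_pin(local, sizeof(local));
	starpu_vector_data_register(&h2, STARPU_MAIN_RAM, (uintptr_t)local, N, sizeof(*local));

	/* ... submit tasks working on h1 and h2: transfers can now overlap with computation ... */

	starpu_data_unregister(h1);
	starpu_data_unregister(h2);
	starpu_memory_unpin(local, sizeof(local));
	starpu_free(vec);
	starpu_shutdown();
	return 0;
}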

+ 1 - 1
doc/doxygen/chapters/40environment_variables.doxy

@@ -303,7 +303,7 @@ This permits to test the performance effect of GPU-Direct.
 <dd>
 \anchor STARPU_DISABLE_PINNING
 \addindex __env__STARPU_DISABLE_PINNING
-Disable (1) or Enable (0) pinning host memory allocated through starpu_malloc
+Disable (1) or Enable (0) the pinning of host memory by starpu_malloc, starpu_memory_pin
 and friends.  The default is Enabled.
 This permits to test the performance effect of memory pinning.
 </dd>
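
One minimal way to exercise this variable from a benchmark program (a sketch, not part of the commit; exporting the variable from the shell before launching the program works just as well) is to set it before starpu_init():

#include <stdlib.h>
#include <starpu.h>

int main(void)
{
	/* Disable host memory pinning to measure its performance effect;
	 * must be set before starpu_init() reads the environment. */
	setenv("STARPU_DISABLE_PINNING", "1", 1);

	if (starpu_init(NULL) != 0)
		return 1;

	/* ... run the usual workload and compare timings with pinning enabled ... */

	starpu_shutdown();
	return 0;
}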

+ 11 - 0
doc/doxygen/chapters/api/standard_memory_library.doxy

@@ -75,6 +75,17 @@ This function frees memory by specifying its size. The given
 flags should be consistent with the ones given to starpu_malloc_flags()
 when allocating the memory.
 
+\fn int starpu_memory_pin(void *addr, size_t size)
+\ingroup API_Standard_Memory_Library
+This function pins the given memory area, so that CPU-GPU transfers can be done
+asynchronously with DMAs. The memory must be unpinned with
+starpu_memory_unpin() before being freed. Returns 0 on success, -1 on error.
+
+\fn int starpu_memory_unpin(void *addr, size_t size)
+\ingroup API_Standard_Memory_Library
+This function unpins the given memory area previously pinned with
+starpu_memory_pin(). Returns 0 on success, -1 on error.
+
 \fn ssize_t starpu_memory_get_total(unsigned node)
 \ingroup API_Standard_Memory_Library
 If a memory limit is defined on the given node (see Section \ref
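
A usage sketch for the two new calls (assumed, not taken from the commit): check the return values, and always unpin the area before it is freed or reused outside StarPU:

#include <stdio.h>
#include <starpu.h>

static float buffer[4096];

void use_pinned_buffer(void)
{
	/* Pin so that CPU-GPU transfers of this buffer can be done asynchronously. */
	if (starpu_memory_pin(buffer, sizeof(buffer)) != 0)
		fprintf(stderr, "starpu_memory_pin failed, transfers may be synchronous\n");

	/* ... register the buffer with StarPU and submit tasks ... */

	/* Unpin once StarPU no longer needs to transfer this buffer. */
	if (starpu_memory_unpin(buffer, sizeof(buffer)) != 0)
		fprintf(stderr, "starpu_memory_unpin failed\n");
}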

+ 4 - 1
include/starpu_stdlib.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2014  Université de Bordeaux
+ * Copyright (C) 2010-2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -37,6 +37,9 @@ int starpu_free(void *A);
 int starpu_malloc_flags(void **A, size_t dim, int flags);
 int starpu_free_flags(void *A, size_t dim, int flags);
 
+int starpu_memory_pin(void *addr, size_t size);
+int starpu_memory_unpin(void *addr, size_t size);
+
 starpu_ssize_t starpu_memory_get_total(unsigned node);
 starpu_ssize_t starpu_memory_get_available(unsigned node);
 void starpu_memory_wait_available(unsigned node, size_t size);

+ 26 - 0
src/datawizard/malloc.c

@@ -601,6 +601,32 @@ _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
 
 }
 
+int
+starpu_memory_pin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED)
+{
+	if (STARPU_MALLOC_PINNED && starpu_get_env_number("STARPU_DISABLE_PINNING") <= 0 && RUNNING_ON_VALGRIND == 0)
+	{
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+		if (cudaHostRegister(addr, size, cudaHostRegisterPortable) != cudaSuccess)
+			return -1;
+#endif
+	}
+	return 0;
+}
+
+int
+starpu_memory_unpin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED)
+{
+	if (STARPU_MALLOC_PINNED && starpu_get_env_number("STARPU_DISABLE_PINNING") <= 0 && RUNNING_ON_VALGRIND == 0)
+	{
+#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
+		if (cudaHostUnregister(addr) != cudaSuccess)
+			return -1;
+#endif
+	}
+	return 0;
+}
+
 /*
  * On CUDA which has very expensive malloc, for small sizes, allocate big
  * chunks divided in blocks, and we actually allocate segments of consecutive