Browse Source

Disable quick-allocation for pinned allocation

We used to optimize allocation requests by allocating immediately when the
target is "just RAM". But when we are pinning that memory for CUDA, the
allocation is actually expensive (or worse, it has to wait for an allocation
task in the case of !CUDA_MEMCPY_PEER), so we would rather defer it to some
point when we have the time to do it (possibly on an idle worker or during a
CUDA task).
Samuel Thibault 4 years ago
parent
commit
51eda9d44c
3 changed files with 21 additions and 3 deletions
  1. 4 3
      src/datawizard/coherency.c
  2. 10 0
      src/datawizard/malloc.c
  3. 7 0
      src/datawizard/malloc.h

+ 4 - 3
src/datawizard/coherency.c

@@ -568,10 +568,11 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
 		if (mode & STARPU_W)
 			dst_replicate->initialized = 1;
-		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait
+			&& !_starpu_malloc_willpin_on_node(requesting_node))
 		{
-			/* And this is the main RAM, really no need for a
-			 * request, just allocate */
+			/* And this is the main RAM without pinning, really no need for a
+			 * request, just quickly allocate and be done */
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);

+ 10 - 0
src/datawizard/malloc.c

@@ -149,6 +149,15 @@ static int _starpu_malloc_should_pin(int flags)
 	return 0;
 }
 
+int _starpu_malloc_willpin_on_node(unsigned dst_node)
+{
+	int flags = malloc_on_node_default_flags[dst_node]; /* per-node default allocation flags, looked up by node index */
+	return (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0 /* same pinning test as the one in _starpu_malloc_flags_on_node; keep both in sync */
+			&& (_starpu_can_submit_cuda_task() /* pinning is only reported when CUDA tasks can be submitted */
+			    /* || _starpu_can_submit_opencl_task() */
+			));
+}
+
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
 {
 	int ret=0;
@@ -185,6 +194,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 		goto end;
 	}
 
+	/* Note: synchronize this test with _starpu_malloc_willpin_on_node */
 	if (_starpu_malloc_should_pin(flags) && STARPU_RUNNING_ON_VALGRIND == 0)
 	{
 		if (_starpu_can_submit_cuda_task())

+ 7 - 0
src/datawizard/malloc.h

@@ -26,4 +26,11 @@ void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
 
 int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
 int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
+
+/**
+   Return non-zero when allocating data on \p dst_node would pin the memory,
+   i.e. when the allocation would be very expensive and should thus be moved
+   off the critical path rather than performed immediately.
+  */
+int _starpu_malloc_willpin_on_node(unsigned dst_node);
 #endif