Sfoglia il codice sorgente

When StarPU is shut down, liberate the memory that was implicitly allocated by
the DSM to replicate data. This is typically useful in the case of write-only
buffers, which are never explicitly allocated by the programmer.

Cédric Augonnet 15 anni fa
parent
commit
c0ba4bf5e9

+ 80 - 29
src/datawizard/memalloc.c

@@ -188,8 +188,7 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 	}
 }
 
-
-static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node, unsigned attempts)
+static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
 {
 	size_t liberated = 0;
 
@@ -199,13 +198,6 @@ static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node, unsign
 
 	STARPU_ASSERT(handle);
 
-	if (attempts == 0)
-	{
-		/* this is the first attempt to free memory
-		   so we avoid to drop requested memory */
-		/* TODO */
-	}
-
 	/* try to lock all the leafs of the subtree */
 	lock_all_subtree(handle);
 
@@ -360,22 +352,16 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle
 }
 #endif
 
-/* 
- * Try to free some memory on the specified node
- * 	returns 0 if no memory was released, 1 else
+/*
+ * Liberate the memory chunks that are explicitly tagged to be liberated. The
+ * mc_rwlock[node] rw-lock should be taken prior to calling this function.
  */
-static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unused)), unsigned attempts)
+static size_t perform_mc_removal_requests(uint32_t node)
 {
-//	fprintf(stderr, "reclaim memory...\n");
-
-	int res;
+	starpu_mem_chunk_t mc, next_mc;
+	
 	size_t liberated = 0;
 
-	res = pthread_rwlock_wrlock(&mc_rwlock[node]);
-	STARPU_ASSERT(!res);
-
-	/* remove all buffers for which there was a removal request */
-	starpu_mem_chunk_t mc, next_mc;
 	for (mc = starpu_mem_chunk_list_begin(mc_list_to_free[node]);
 	     mc != starpu_mem_chunk_list_end(mc_list_to_free[node]);
 	     mc = next_mc)
@@ -390,7 +376,21 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 		starpu_mem_chunk_delete(mc);
 	}
 
-	/* try to free all allocated data potentially in use .. XXX */
+	return liberated;
+}
+
+/*
+ * Try to liberate the buffers currently in use on the memory node. If force is
+ * set, memory is liberated regardless of coherency concerns (only use this at
+ * StarPU termination, for instance). Caller must hold mc_rwlock[node].
+ * NOTE(review): the body never returns `liberated`, although callers use it.
+ */
+static size_t liberate_potentially_in_use_mc(uint32_t node, unsigned force)
+{
+	size_t liberated = 0;
+
+	starpu_mem_chunk_t mc, next_mc;
+
 	for (mc = starpu_mem_chunk_list_begin(mc_list[node]);
 	     mc != starpu_mem_chunk_list_end(mc_list[node]);
 	     mc = next_mc)
@@ -400,14 +400,63 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 		   element of the list now */
 		next_mc = starpu_mem_chunk_list_next(mc);
 
-		liberated += try_to_free_mem_chunk(mc, node, attempts);
-		#if 0
-		if (liberated > toreclaim)
-			break;
-		#endif
+		if (!force)
+		{
+			liberated += try_to_free_mem_chunk(mc, node);
+			#if 0
+			if (liberated > toreclaim)
+				break;
+			#endif
+		}
+		else {
+			/* We must liberate the memory now: note that data
+			 * coherency is not maintained in that case ! */
+			liberated += do_free_mem_chunk(mc, node);
+		}
 	}
+}
 
-//	fprintf(stderr, "got %d MB back\n", (int)liberated/(1024*1024));
+/* 
+ * Try to free some memory on the specified node
+ * 	returns the total size reported as liberated (0 if nothing was released)
+ */
+
+static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unused)))
+{
+	int res;
+	size_t liberated = 0;
+
+	res = pthread_rwlock_wrlock(&mc_rwlock[node]);
+	STARPU_ASSERT(!res);
+
+	/* remove all buffers for which there was a removal request */
+	liberated += perform_mc_removal_requests(node);
+
+	/* try to free all allocated data potentially in use */
+	liberated += liberate_potentially_in_use_mc(node, 0);
+
+	res = pthread_rwlock_unlock(&mc_rwlock[node]);
+	STARPU_ASSERT(!res);
+
+	return liberated;
+}
+
+/*
+ * This function liberates all the memory that was implicitly allocated by
+ * StarPU (for the data replicates). This does not ensure data coherency, and
+ * should only be called while StarPU is getting shut down.
+ */
+size_t _starpu_liberate_all_automatically_allocated_buffers(uint32_t node)
+{
+	int res;
+
+	size_t liberated = 0;
+
+	res = pthread_rwlock_wrlock(&mc_rwlock[node]);
+	STARPU_ASSERT(!res);
+
+	liberated += perform_mc_removal_requests(node);
+	liberated += liberate_potentially_in_use_mc(node, 1);
 
 	res = pthread_rwlock_unlock(&mc_rwlock[node]);
 	STARPU_ASSERT(!res);
@@ -415,6 +464,8 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 	return liberated;
 }
 
+
+
 static void register_mem_chunk(starpu_data_handle handle, uint32_t dst_node, size_t size, unsigned automatically_allocated)
 {
 	int res;
@@ -578,7 +629,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle handle, uint32_t dst_node
 			size_t data_size = handle->ops->get_size(handle);
 
 			STARPU_TRACE_START_MEMRECLAIM(dst_node);
-			reclaim_memory(dst_node, 2*data_size, attempts);
+			reclaim_memory(dst_node, 2*data_size);
 			STARPU_TRACE_END_MEMRECLAIM(dst_node);
 		}
 		

+ 1 - 0
src/datawizard/memalloc.h

@@ -48,5 +48,6 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node);
 int _starpu_allocate_memory_on_node(starpu_data_handle handle, uint32_t dst_node, unsigned may_alloc);
+size_t _starpu_liberate_all_automatically_allocated_buffers(uint32_t node);
 
 #endif

+ 5 - 0
src/drivers/cpu/driver_cpu.c

@@ -193,6 +193,11 @@ void *_starpu_cpu_worker(void *arg)
 
 	STARPU_TRACE_WORKER_DEINIT_START
 
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_liberate_all_automatically_allocated_buffers(memnode);
+
 #ifdef STARPU_DATA_STATS
 	fprintf(stderr, "CPU #%d computation %le comm %le (%lf \%%)\n", cpu_arg->id, cpu_arg->jobq->total_computation_time, cpu_arg->jobq->total_communication_time,  cpu_arg->jobq->total_communication_time*100.0/cpu_arg->jobq->total_computation_time);
 #endif

+ 5 - 0
src/drivers/cuda/driver_cuda.c

@@ -278,6 +278,11 @@ void *_starpu_cuda_worker(void *arg)
 
 	STARPU_TRACE_WORKER_DEINIT_START
 
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_liberate_all_automatically_allocated_buffers(memnode);
+
 	deinit_context(args->workerid);
 
 #ifdef STARPU_DATA_STATS