소스 검색

When StarPU is shut down, liberate the memory that was implicitly allocated by
the DSM to replicate data. This is typically useful in the case of write-only
buffers which are never explicitly allocated by the programmer.

Cédric Augonnet 15 년 전
부모
커밋
c0ba4bf5e9
4개의 변경된 파일91개의 추가작업 그리고 29개의 파일을 삭제
  1. 80 29
      src/datawizard/memalloc.c
  2. 1 0
      src/datawizard/memalloc.h
  3. 5 0
      src/drivers/cpu/driver_cpu.c
  4. 5 0
      src/drivers/cuda/driver_cuda.c

+ 80 - 29
src/datawizard/memalloc.c

@@ -188,8 +188,7 @@ static void transfer_subtree_to_node(starpu_data_handle handle, unsigned src_nod
 	}
 }
 
-
-static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node, unsigned attempts)
+static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node)
 {
 	size_t liberated = 0;
 
@@ -199,13 +198,6 @@ static size_t try_to_free_mem_chunk(starpu_mem_chunk_t mc, unsigned node, unsign
 
 	STARPU_ASSERT(handle);
 
-	if (attempts == 0)
-	{
-		/* this is the first attempt to free memory
-		   so we avoid to drop requested memory */
-		/* TODO */
-	}
-
 	/* try to lock all the leafs of the subtree */
 	lock_all_subtree(handle);
 
@@ -360,22 +352,16 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle
 }
 #endif
 
-/* 
- * Try to free some memory on the specified node
- * 	returns 0 if no memory was released, 1 else
+/*
+ * Liberate the memory chunks that are explicitly tagged to be liberated. The
+ * mc_rwlock[node] rw-lock should be taken prior to calling this function.
  */
-static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unused)), unsigned attempts)
+static size_t perform_mc_removal_requests(uint32_t node)
 {
-//	fprintf(stderr, "reclaim memory...\n");
-
-	int res;
+	starpu_mem_chunk_t mc, next_mc;
+	
 	size_t liberated = 0;
 
-	res = pthread_rwlock_wrlock(&mc_rwlock[node]);
-	STARPU_ASSERT(!res);
-
-	/* remove all buffers for which there was a removal request */
-	starpu_mem_chunk_t mc, next_mc;
 	for (mc = starpu_mem_chunk_list_begin(mc_list_to_free[node]);
 	     mc != starpu_mem_chunk_list_end(mc_list_to_free[node]);
 	     mc = next_mc)
@@ -390,7 +376,21 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 		starpu_mem_chunk_delete(mc);
 	}
 
-	/* try to free all allocated data potentially in use .. XXX */
+	return liberated;
+}
+
+/*
+ * Try to liberate the buffers currently in use on the memory node. If the
+ * force flag is set, the memory is liberated regardless of coherency concerns
+ * (this should only be used at the termination of StarPU for instance). The
+ * mc_rwlock[node] rw-lock should be taken prior to calling this function.
+ */
+static size_t liberate_potentially_in_use_mc(uint32_t node, unsigned force)
+{
+	size_t liberated = 0;
+
+	starpu_mem_chunk_t mc, next_mc;
+
 	for (mc = starpu_mem_chunk_list_begin(mc_list[node]);
 	     mc != starpu_mem_chunk_list_end(mc_list[node]);
 	     mc = next_mc)
@@ -400,14 +400,64 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 		   element of the list now */
 		next_mc = starpu_mem_chunk_list_next(mc);
 
-		liberated += try_to_free_mem_chunk(mc, node, attempts);
-		#if 0
-		if (liberated > toreclaim)
-			break;
-		#endif
+		if (!force)
+		{
+			liberated += try_to_free_mem_chunk(mc, node);
+			#if 0
+			if (liberated > toreclaim)
+				break;
+			#endif
+		}
+		else {
+			/* We must liberate the memory now: note that data
+			 * coherency is not maintained in that case ! */
+			liberated += do_free_mem_chunk(mc, node);
+		}
 	}
+	return liberated;
+}
 
-//	fprintf(stderr, "got %d MB back\n", (int)liberated/(1024*1024));
+/* 
+ * Try to free some memory on the specified node
+ * 	returns the amount of memory that was released (0 if none was)
+ */
+
+static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unused)))
+{
+	int res;
+	size_t liberated = 0;
+
+	res = pthread_rwlock_wrlock(&mc_rwlock[node]);
+	STARPU_ASSERT(!res);
+
+	/* remove all buffers for which there was a removal request */
+	liberated += perform_mc_removal_requests(node);
+
+	/* try to free all allocated data potentially in use */
+	liberated += liberate_potentially_in_use_mc(node, 0);
+
+	res = pthread_rwlock_unlock(&mc_rwlock[node]);
+	STARPU_ASSERT(!res);
+
+	return liberated;
+}
+
+/*
+ * This function liberates all the memory that was implicitly allocated by
+ * StarPU (for the data replicas). It does not ensure data coherency, and
+ * should only be called while StarPU is getting shut down.
+ */
+size_t _starpu_liberate_all_automatically_allocated_buffers(uint32_t node)
+{
+	int res;
+
+	size_t liberated = 0;
+
+	res = pthread_rwlock_wrlock(&mc_rwlock[node]);
+	STARPU_ASSERT(!res);
+
+	liberated += perform_mc_removal_requests(node);
+	liberated += liberate_potentially_in_use_mc(node, 1);
 
 	res = pthread_rwlock_unlock(&mc_rwlock[node]);
 	STARPU_ASSERT(!res);
@@ -415,6 +464,8 @@ static size_t reclaim_memory(uint32_t node, size_t toreclaim __attribute__ ((unu
 	return liberated;
 }
 
+
+
 static void register_mem_chunk(starpu_data_handle handle, uint32_t dst_node, size_t size, unsigned automatically_allocated)
 {
 	int res;
@@ -578,7 +629,7 @@ int _starpu_allocate_memory_on_node(starpu_data_handle handle, uint32_t dst_node
 			size_t data_size = handle->ops->get_size(handle);
 
 			STARPU_TRACE_START_MEMRECLAIM(dst_node);
-			reclaim_memory(dst_node, 2*data_size, attempts);
+			reclaim_memory(dst_node, 2*data_size);
 			STARPU_TRACE_END_MEMRECLAIM(dst_node);
 		}
 		

+ 1 - 0
src/datawizard/memalloc.h

@@ -48,5 +48,6 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle handle, unsigned node);
 int _starpu_allocate_memory_on_node(starpu_data_handle handle, uint32_t dst_node, unsigned may_alloc);
+size_t _starpu_liberate_all_automatically_allocated_buffers(uint32_t node);
 
 #endif

+ 5 - 0
src/drivers/cpu/driver_cpu.c

@@ -193,6 +193,11 @@ void *_starpu_cpu_worker(void *arg)
 
 	STARPU_TRACE_WORKER_DEINIT_START
 
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_liberate_all_automatically_allocated_buffers(memnode);
+
 #ifdef STARPU_DATA_STATS
 	fprintf(stderr, "CPU #%d computation %le comm %le (%lf \%%)\n", cpu_arg->id, cpu_arg->jobq->total_computation_time, cpu_arg->jobq->total_communication_time,  cpu_arg->jobq->total_communication_time*100.0/cpu_arg->jobq->total_computation_time);
 #endif

+ 5 - 0
src/drivers/cuda/driver_cuda.c

@@ -278,6 +278,11 @@ void *_starpu_cuda_worker(void *arg)
 
 	STARPU_TRACE_WORKER_DEINIT_START
 
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_liberate_all_automatically_allocated_buffers(memnode);
+
 	deinit_context(args->workerid);
 
 #ifdef STARPU_DATA_STATS