瀏覽代碼

Memory reclaiming bug fix: while reclaiming memory, we must not hold the handle's header lock (the lock on the header describing the entire state of the handle) while other handles are being inspected to find available memory.

Cédric Augonnet 15 年之前
父節點
當前提交
dabb837619
共有 2 個文件被更改,包括 13 次插入和 4 次删除
  1. 3 3
      src/datawizard/data_request.c
  2. 10 1
      src/datawizard/memalloc.c

+ 3 - 3
src/datawizard/data_request.c

@@ -304,8 +304,11 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->allocated);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->refcnt);
 	STARPU_ASSERT(!(r_mode & STARPU_R) || src_replicate->refcnt);
 
 
+	_starpu_spin_unlock(&r->lock);
+
 	/* perform the transfer */
 	/* perform the transfer */
 	/* the header of the data must be locked by the worker that submitted the request */
 	/* the header of the data must be locked by the worker that submitted the request */
+
 	r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
 	r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
 			dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
 			dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
 
 
@@ -313,9 +316,7 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 	{
 	{
 		/* If there was not enough memory, we will try to redo the
 		/* If there was not enough memory, we will try to redo the
 		 * request later. */
 		 * request later. */
-		_starpu_spin_unlock(&r->lock);
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
-
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
 
 
@@ -325,7 +326,6 @@ static int starpu_handle_data_request(starpu_data_request_t r, unsigned may_allo
 		 * immediatly. We will handle the completion of the request
 		 * immediatly. We will handle the completion of the request
 		 * asynchronously. The request is put in the list of "pending"
 		 * asynchronously. The request is put in the list of "pending"
 		 * requests in the meantime. */
 		 * requests in the meantime. */
-		_starpu_spin_unlock(&r->lock);
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);
 		PTHREAD_MUTEX_LOCK(&data_requests_pending_list_mutex[r->handling_node]);

+ 10 - 1
src/datawizard/memalloc.c

@@ -693,10 +693,19 @@ static ssize_t _starpu_allocate_interface(starpu_data_handle handle, struct star
 		allocated_memory = handle->ops->allocate_data_on_node(replicate->interface, dst_node);
 		allocated_memory = handle->ops->allocate_data_on_node(replicate->interface, dst_node);
 		STARPU_TRACE_END_ALLOC(dst_node);
 		STARPU_TRACE_END_ALLOC(dst_node);
 
 
-		if (allocated_memory == -ENOMEM) {
+		if (allocated_memory == -ENOMEM)
+		{
+			replicate->refcnt++;
+			_starpu_spin_unlock(&handle->header_lock);
+
 			STARPU_TRACE_START_MEMRECLAIM(dst_node);
 			STARPU_TRACE_START_MEMRECLAIM(dst_node);
 			reclaim_memory_generic(dst_node, 0);
 			reclaim_memory_generic(dst_node, 0);
 			STARPU_TRACE_END_MEMRECLAIM(dst_node);
 			STARPU_TRACE_END_MEMRECLAIM(dst_node);
+
+		        while (_starpu_spin_trylock(&handle->header_lock))
+		                _starpu_datawizard_progress(_starpu_get_local_memory_node(), 0);
+		
+			replicate->refcnt--;
 		}
 		}
 		
 		
 	} while((allocated_memory == -ENOMEM) && attempts++ < 2);
 	} while((allocated_memory == -ENOMEM) && attempts++ < 2);