Explorar o código

port of StarPU-1.2 rev 22589:
address bug report from Kevin Juilly regarding data replicates allocated directly on GPUs through prefetch and evicted due to low memory condition before having been initialized

Olivier Aumage hai 7 anos
pai
achega
4d64202827
Modificáronse 2 ficheiros con 48 adicións e 12 borrados
  1. 30 3
      src/datawizard/coherency.c
  2. 18 9
      src/datawizard/memalloc.c

+ 30 - 3
src/datawizard/coherency.c

@@ -101,8 +101,14 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 	{
 		/* Could estimate through cost, return that */
 		STARPU_ASSERT(handle->per_node[src_node].allocated);
-		STARPU_ASSERT(handle->per_node[src_node].initialized);
-		return src_node;
+		if (handle->per_node[src_node].initialized)
+		{
+			return src_node;
+		}
+		else
+		{
+			return -1;
+		}
 	}
 	
 	int i_ram = -1;
@@ -165,7 +171,10 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 
 	STARPU_ASSERT(src_node != -1);
 	STARPU_ASSERT(handle->per_node[src_node].allocated);
-	STARPU_ASSERT(handle->per_node[src_node].initialized);
+	if (!handle->per_node[src_node].initialized)
+	{
+		return -1;
+	}
 	return src_node;
 }
 
@@ -578,7 +587,25 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 	if (dst_replicate && mode & STARPU_R)
 	{
 		if (dst_replicate->state == STARPU_INVALID)
+		{
 			src_node = _starpu_select_src_node(handle, requesting_node);
+			if (src_node == -1 && requesting_node == STARPU_MAIN_RAM && !nwait)
+			{
+				/* And this is the main RAM, really no need for a
+				 * request, just allocate */
+				if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
+				{
+					_starpu_update_data_state(handle, dst_replicate, mode);
+
+					_starpu_spin_unlock(&handle->header_lock);
+
+					if (callback_func)
+						callback_func(callback_arg);
+					_STARPU_LOG_OUT_TAG("data immediately allocated");
+					return NULL;
+				}
+			}
+		}
 		else
 			src_node = requesting_node;
 		if (src_node < 0)

+ 18 - 9
src/datawizard/memalloc.c

@@ -260,15 +260,24 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
 			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, 0, 0, NULL, NULL, 0, "transfer_subtree_to_node");
-			/* There is no way we don't need a request, since
-			 * source is OWNER, destination can't be having it */
-			STARPU_ASSERT(r);
-			/* Keep the handle alive while we are working on it */
-			handle->busy_count++;
-			_starpu_spin_unlock(&handle->header_lock);
-			_starpu_wait_data_request_completion(r, 1);
-			_starpu_spin_lock(&handle->header_lock);
-			handle->busy_count--;
+			/* r may be NULL if we are tidying a replicate that was
+			 * allocated on a GPU but has not yet been initialized
+			 */
+			if (r != NULL)
+			{
+				/* Keep the handle alive while we are working on it */
+				handle->busy_count++;
+				_starpu_spin_unlock(&handle->header_lock);
+				_starpu_wait_data_request_completion(r, 1);
+				_starpu_spin_lock(&handle->header_lock);
+				handle->busy_count--;
+			}
+			else
+			{
+				/* _starpu_create_request_to_fetch_data unlocks the handle
+				 * when bailing out */
+				_starpu_spin_lock(&handle->header_lock);
+			}
 			if (_starpu_data_check_not_busy(handle))
 				/* Actually disappeared, abort completely */
 				return -1;