瀏覽代碼

* Add starpu_data_wont_use to advise that a piece of data will not be used
in the near future.

Samuel Thibault 10 年之前
父節點
當前提交
7b2d1621c4

+ 2 - 0
ChangeLog

@@ -94,6 +94,8 @@ New features:
   * Anticipative writeback, to flush dirty data asynchronously before the
     GPU device is full. Disabled by default. Use STARPU_MINIMUM_CLEAN_BUFFERS
     and STARPU_TARGET_CLEAN_BUFFERS to enable it.
+  * Add starpu_data_wont_use to advise that a piece of data will not be used
+    in the near future.
 
 Small features:
   * Tasks can now have a name (via the field const char *name of

+ 4 - 0
doc/doxygen/chapters/07data_management.doxy

@@ -110,6 +110,10 @@ handle and the desired target memory node. The
 starpu_data_idle_prefetch_on_node() variant can be used to issue the transfer
 only when the bus is idle.
 
+Conversely, one can advise StarPU that some data will not be useful in the
+near future by calling starpu_data_wont_use(). StarPU will then write its value
+back to its home node, and evict it from GPUs when room is needed.
+
 \section PartitioningData Partitioning Data
 
 An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:

+ 7 - 0
doc/doxygen/chapters/api/data_management.doxy

@@ -195,6 +195,13 @@ block until the transfer is achieved, else the call will return immediately,
 after having just queued the request. In the latter case, the request will
 asynchronously wait for the completion of any task writing on the data.
 
+\fn void starpu_data_wont_use(starpu_data_handle_t handle)
+\ingroup API_Data_Management
+Advise StarPU that \p handle will not be used in the near future, and is
+thus a good candidate for eviction from GPUs. StarPU will then write its value
+back to its home node when the bus is idle, and select this data in priority
+for eviction when memory gets low.
+
 \fn starpu_data_handle_t starpu_data_lookup(const void *ptr)
 \ingroup API_Data_Management
 Return the handle corresponding to the data pointed to by the \p ptr host pointer.

+ 3 - 1
examples/cholesky/cholesky_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2014  Université de Bordeaux
+ * Copyright (C) 2009-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
@@ -80,6 +80,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			if (ret == -ENODEV) return 77;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
+		starpu_data_wont_use(sdatakk);
 
 		for (j = k+1; j<nblocks; j++)
 		{
@@ -103,6 +104,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
                                 }
 			}
+			starpu_data_wont_use(sdatakj);
 		}
 	}
 

+ 7 - 1
examples/lu/xlu_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2014  Université de Bordeaux
+ * Copyright (C) 2010-2011, 2014-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2015  Centre National de la Recherche Scientifique
  *
@@ -134,6 +134,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 		     ret = create_task_21(dataA, k, i);
 		     if (ret == -ENODEV) return ret;
 		}
+		starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, k));
 
 		for (i = k+1; i<nblocks; i++)
 		     for (j = k+1; j<nblocks; j++)
@@ -141,6 +142,11 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 			  ret = create_task_22(dataA, k, i, j);
 			  if (ret == -ENODEV) return ret;
 		     }
+		for (i = k+1; i<nblocks; i++)
+		{
+		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i));
+		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k));
+		}
 	}
 
 	/* stall the application until the end of computations */

+ 7 - 1
examples/lu/xlu_implicit_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2014  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2014-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -188,6 +188,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 		     ret = create_task_21(dataAp, nblocks, k, i, get_block);
 		     if (ret == -ENODEV) return ret;
 		}
+		starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, k));
 
 		for (i = k+1; i<nblocks; i++)
 		     for (j = k+1; j<nblocks; j++)
@@ -195,6 +196,11 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 			  ret = create_task_22(dataAp, nblocks, k, i, j, get_block);
 			  if (ret == -ENODEV) return ret;
 		     }
+		for (i = k+1; i<nblocks; i++)
+		{
+		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i));
+		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k));
+		}
 	}
 
 	/* stall the application until the end of computations */

+ 2 - 0
include/starpu_data.h

@@ -92,6 +92,8 @@ int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsign
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async);
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async);
 
+void starpu_data_wont_use(starpu_data_handle_t handle);
+
 #define STARPU_MAIN_RAM 0
 
 enum starpu_node_kind

+ 5 - 1
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -122,6 +122,8 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 					       0);
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+			if (my_distrib(k, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][k]);
 
 			for (i = k+1; i<nblocks; i++)
 			{
@@ -139,6 +141,8 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			}
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][j]);
+			if (my_distrib(k, j, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][j]);
 		}
 	}
 

+ 1 - 3
mpi/examples/mpi_lu/plu_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux
+ * Copyright (C) 2010-2011, 2013, 2015  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -67,8 +67,6 @@ static starpu_data_handle_t *(tmp_21_block_handles[2]);
 static TYPE **(tmp_21_block[2]);
 #endif
 
-int get_block_rank(unsigned i, unsigned j);
-
 static void parse_args(int rank, int argc, char **argv)
 {
 	int i;

+ 7 - 1
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013-2014  Université de Bordeaux
+ * Copyright (C) 2010-2011, 2013-2015  Université de Bordeaux
  * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -140,6 +140,8 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 		}
 
 		starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(k,k));
+		if (get_block_rank(k, k) == _rank)
+			starpu_data_wont_use(STARPU_PLU(get_block_handle)(k,k));
 
 		for (i = k+1; i<nblocks; i++)
 		{
@@ -152,7 +154,11 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 		for (i = k+1; i<nblocks; i++)
 		{
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(k,i));
+			if (get_block_rank(k, i) == _rank)
+				starpu_data_wont_use(STARPU_PLU(get_block_handle)(k,i));
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, STARPU_PLU(get_block_handle)(i,k));
+			if (get_block_rank(i, k) == _rank)
+				starpu_data_wont_use(STARPU_PLU(get_block_handle)(i,k));
 		}
 	}
 

+ 23 - 3
src/datawizard/memalloc.c

@@ -39,12 +39,10 @@ static struct _starpu_mem_chunk *mc_dirty_head[STARPU_MAXNODES];
 static unsigned mc_nb[STARPU_MAXNODES], mc_clean_nb[STARPU_MAXNODES];
 
 /* TODO: no home doesn't mean always clean, should push to larger memory nodes */
-/* TODO: REDUX always dirty */
-
 #define MC_LIST_PUSH_BACK(node, mc) do {				 \
 	_starpu_mem_chunk_list_push_back(mc_list[node], mc);		 \
 	if ((mc)->clean || (mc)->home)					 \
-		/* This is clean */				 \
+		/* This is clean */					 \
 		mc_clean_nb[node]++;					 \
 	else if (!mc_dirty_head[node])					 \
 		/* This is the only dirty element for now */		 \
@@ -52,6 +50,13 @@ static unsigned mc_nb[STARPU_MAXNODES], mc_clean_nb[STARPU_MAXNODES];
 	mc_nb[node]++;							 \
 } while(0)
 
+/* Insert mc at the front of mc_list[node] and account for it as a clean
+ * chunk: both mc_clean_nb[node] and mc_nb[node] are incremented. The macro
+ * does not inspect (mc)->clean or (mc)->home itself. */
+#define MC_LIST_PUSH_CLEAN(node, mc) do {				 \
+	_starpu_mem_chunk_list_push_front(mc_list[node], mc);		 \
+	/* This is clean */						 \
+	mc_clean_nb[node]++;						 \
+	mc_nb[node]++;							 \
+} while (0)
+
 #define MC_LIST_ERASE(node, mc) do {					 \
 	if ((mc)->clean || (mc)->home)					 \
 		mc_clean_nb[node]--; /* One clean element less */	 \
@@ -1330,6 +1335,21 @@ void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node)
 	_starpu_spin_unlock(&mc_lock[node]);
 }
 
+/* This memchunk will not be used in the near future: put it on the clean
+ * list, so that we will evict it first.
+ * A NULL mc (user-allocated memory) is ignored. The list manipulation is
+ * done while holding mc_lock[node]. */
+void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *mc, unsigned node)
+{
+	if (!mc)
+		/* user-allocated memory */
+		return;
+	_starpu_spin_lock(&mc_lock[node]);
+	/* Unlink before touching mc->clean: MC_LIST_ERASE reads it to decide
+	 * which counter to decrement */
+	MC_LIST_ERASE(node, mc);
+	/* Caller will schedule a clean transfer */
+	mc->clean = 1;
+	MC_LIST_PUSH_CLEAN(node, mc);
+	_starpu_spin_unlock(&mc_lock[node]);
+}
+
 /* This memchunk is being written to, and thus becomes dirty */
 void _starpu_memchunk_dirty(struct _starpu_mem_chunk *mc, unsigned node)
 {

+ 1 - 0
src/datawizard/memalloc.h

@@ -81,6 +81,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
+/* Hint that memory chunk mc, living on memory node node, will not be used soon. */
+void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_dirty(struct _starpu_mem_chunk *mc, unsigned node);
 
 void _starpu_display_memory_stats_by_node(int node);

+ 29 - 0
src/datawizard/user_interactions.c

@@ -462,6 +462,35 @@ int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node
 	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 2);
 }
 
+/* Callback passed by starpu_data_wont_use() to starpu_data_acquire_on_node_cb().
+ * data is the handle itself. Under the handle's header lock, flag every
+ * automatically-allocated replicate (per memory node, then per worker) as not
+ * needed soon; then release the handle and, if it has a home node, queue an
+ * idle prefetch of the data to that home node. */
+static void _starpu_data_wont_use(void *data)
+{
+	unsigned node, worker, nworkers = starpu_worker_get_count();
+	starpu_data_handle_t handle = data;
+
+	_starpu_spin_lock(&handle->header_lock);
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct _starpu_data_replicate *local = &handle->per_node[node];
+		if (local->allocated && local->automatically_allocated)
+			_starpu_memchunk_wont_use(local->mc, node);
+	}
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		/* Index by worker, not by node: after the loop above node equals
+		 * STARPU_MAXNODES, which is neither a worker index nor a valid
+		 * memory node. The chunk lives on the worker's memory node. */
+		struct _starpu_data_replicate *local = &handle->per_worker[worker];
+		if (local->allocated && local->automatically_allocated)
+			_starpu_memchunk_wont_use(local->mc, starpu_worker_get_memory_node(worker));
+	}
+	_starpu_spin_unlock(&handle->header_lock);
+	starpu_data_release_on_node(handle, -1);
+	if (handle->home_node != -1)
+		starpu_data_idle_prefetch_on_node(handle, handle->home_node, 1);
+}
+
+/* Public entry point: request acquisition of handle in STARPU_R mode (node
+ * argument -1) with _starpu_data_wont_use() as the callback, which receives
+ * the handle as its argument. */
+void starpu_data_wont_use(starpu_data_handle_t handle)
+{
+	starpu_data_acquire_on_node_cb(handle, -1, STARPU_R, _starpu_data_wont_use, handle);
+}
+
 /*
  *	It is possible to specify that a piece of data can be discarded without
  *	impacting the application.