Browse Source

Add STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_CPU, and STARPU_SPECIFIC_NODE_SLOW

as generic values for codelet specific memory nodes which can be used instead of exact node numbers.
Samuel Thibault 7 years ago
parent
commit
828048bbb0

+ 3 - 0
ChangeLog

@@ -52,6 +52,9 @@ New features:
     trigger the task termination.
   * Add possibility to define the sequential consistency at the task level
     for each handle used by the task.
+  * Add STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_CPU, and
+    STARPU_SPECIFIC_NODE_SLOW as generic values for codelet specific memory
+    nodes which can be used instead of exact node numbers.
 
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation

+ 31 - 4
doc/doxygen/chapters/310_data_management.doxy

@@ -840,8 +840,16 @@ main memory instead of copied in the GPU, a pivoting vector for instance.
 This can be achieved by setting the starpu_codelet::specific_nodes flag to
 <c>1</c>, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
 starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
-where data should be copied to, or <c>-1</c> to let StarPU copy it to the memory node
-where the task will be executed. For instance, with the following codelet:
+where data should be copied to, or <c>STARPU_SPECIFIC_NODE_LOCAL</c> to let
+StarPU copy it to the memory node where the task will be executed.
+
+<c>STARPU_SPECIFIC_NODE_CPU</c> can also be used to request data to be
+put in CPU-accessible memory (and let StarPU choose the NUMA node).
+<c>STARPU_SPECIFIC_NODE_FAST</c> and <c>STARPU_SPECIFIC_NODE_SLOW</c> can also
+be used.
+
+For instance,
+with the following codelet:
 
 \code{.c}
 struct starpu_codelet cl =
@@ -850,12 +858,31 @@ struct starpu_codelet cl =
 	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_RW},
 	.specific_nodes = 1,
-	.nodes = {STARPU_MAIN_RAM, -1},
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},
 };
 \endcode
 
-the first data of the task will be kept in the main memory, while the second
+the first data of the task will be kept in the CPU memory, while the second
 data will be copied to the CUDA GPU as usual. A working example is available in
 <c>tests/datawizard/specific_node.c</c>
 
+With the following codelet:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_SLOW},
+};
+\endcode
+
+The first data will be copied into fast (but probably size-limited) local
+memory, while the second data will be left in slow (but large) memory. This
+makes sense when the kernel does not make many accesses to the second data: the
+data being remote (e.g. over a PCI bus) is then not a performance problem, and
+it avoids filling the fast local memory with data which does not need the
+extra speed.
+
 */

+ 3 - 0
include/starpu_task.h

@@ -91,6 +91,9 @@ typedef starpu_scc_kernel_t (*starpu_scc_func_t)(void);
 
 #define STARPU_VARIABLE_NBUFFERS (-1)
 
+#define STARPU_SPECIFIC_NODE_LOCAL (-1)
+#define STARPU_SPECIFIC_NODE_CPU (-2)
+#define STARPU_SPECIFIC_NODE_SLOW (-3)
 struct starpu_task;
 struct starpu_codelet
 {

+ 1 - 1
src/core/task.c

@@ -591,7 +591,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 			/* Make sure handles are not partitioned */
 			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data (or the pieces of a partitioned data) can be used in a task");
 			/* Make sure the specified node exists */
-			STARPU_ASSERT_MSG(node == -1 || (node >= 0 && node < (int) starpu_memory_nodes_get_count()), "The codelet-specified memory node does not exist");
+			STARPU_ASSERT_MSG(node == STARPU_SPECIFIC_NODE_LOCAL || node == STARPU_SPECIFIC_NODE_CPU || node == STARPU_SPECIFIC_NODE_SLOW || (node >= 0 && node < (int) starpu_memory_nodes_get_count()), "The codelet-specified memory node does not exist");
 			/* Provide the home interface for now if any,
 			 * for can_execute hooks */
 			if (handle->home_node != -1)

+ 16 - 6
src/core/topology.c

@@ -108,20 +108,30 @@ struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
 #endif
 
 /* Avoid using this one, prefer _starpu_task_data_get_node_on_worker */
-int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index, unsigned target_node)
+int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index, unsigned local_node)
 {
-	/* TODO: choose between DDR and MCDRAM according to codelet preference over bandwidth */
-	int node = -1;
+	int node = STARPU_SPECIFIC_NODE_LOCAL;
 	if (task->cl->specific_nodes)
 		node = STARPU_CODELET_GET_NODE(task->cl, index);
-	if (node == -1)
-		node = target_node;
+	switch (node) {
+	case STARPU_SPECIFIC_NODE_LOCAL:
+		node = local_node;
+		break;
+	case STARPU_SPECIFIC_NODE_CPU:
+		// TODO: rather take close NUMA node
+		node = STARPU_MAIN_RAM;
+		break;
+	case STARPU_SPECIFIC_NODE_SLOW:
+		// TODO: rather leave in DDR
+		node = local_node;
+		break;
+	}
 	return node;
 }
 
 int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned index, unsigned worker)
 {
-	/* TODO: choose memory node according to proximity to worker and codelet preference over bandwidth */
+	/* TODO: choose the memory node according to proximity to the worker, rather than just taking the worker's own memory node */
 	unsigned target_node = starpu_worker_get_memory_node(worker);
 	return _starpu_task_data_get_node_on_node(task, index, target_node);
 }

+ 0 - 2
src/sched_policies/heteroprio.c

@@ -639,8 +639,6 @@ done:		;
 	/* if we have task (task) me way have some in the queue (worker->tasks_queue_size) that was freshly addeed (nb_added_tasks) */
 	if(task && worker->tasks_queue.ntasks && nb_added_tasks && starpu_get_prefetch_flag())
 	{
-		const unsigned memory_node = starpu_worker_get_memory_node(workerid);
-
 /* TOTO berenger: iterate in the other sense */
 		struct starpu_task *task_to_prefetch = NULL;
 		for (task_to_prefetch  = starpu_task_prio_list_begin(&worker->tasks_queue.list);

+ 0 - 1
src/sched_policies/parallel_heft.c

@@ -349,7 +349,6 @@ static int _parallel_heft_push_task(struct starpu_task *task, unsigned prio, uns
 
 			local_task_length[worker_ctx][nimpl] = starpu_task_expected_length(task, perf_arch,nimpl);
 
-			unsigned memory_node = starpu_worker_get_memory_node(workerid);
 			local_data_penalty[worker_ctx][nimpl] = starpu_task_expected_data_transfer_time_for(task, workerid);
 
 			double ntasks_end = compute_ntasks_end(workerid, sched_ctx_id);

+ 2 - 2
tests/datawizard/specific_node.c

@@ -35,7 +35,7 @@ unsigned data, data2;
 void specific_kernel(void *descr[], void *arg)
 {
 	(void)arg;
-	STARPU_ASSERT(starpu_task_get_current_data_node(0) == STARPU_MAIN_RAM);
+	STARPU_ASSERT(starpu_node_get_kind(starpu_task_get_current_data_node(0)) == STARPU_CPU_RAM);
 	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	STARPU_ASSERT(dataptr == &data);
@@ -50,7 +50,7 @@ static struct starpu_codelet specific_cl =
 	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_RW},
 	.specific_nodes = 1,
-	.nodes = {STARPU_MAIN_RAM, -1},
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},
 };
 
 void cpu_codelet_unsigned_inc(void *descr[], void *arg)