@@ -840,8 +840,16 @@ main memory instead of copied in the GPU, a pivoting vector for instance.
This can be achieved by setting the starpu_codelet::specific_nodes flag to
<c>1</c>, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
-where data should be copied to, or <c>-1</c> to let StarPU copy it to the memory node
-where the task will be executed. For instance, with the following codelet:
+where data should be copied to, or <c>STARPU_SPECIFIC_NODE_LOCAL</c> to let
+StarPU copy it to the memory node where the task will be executed.
+
+<c>STARPU_SPECIFIC_NODE_CPU</c> can also be used to request data to be
+put in CPU-accessible memory (and let StarPU choose the NUMA node).
+<c>STARPU_SPECIFIC_NODE_FAST</c> and <c>STARPU_SPECIFIC_NODE_SLOW</c> can also
+be used to request data to be put in fast (but probably size-limited) or slow
+(but large) memory, as illustrated in the second example below.
+
+For instance, with the following codelet:

\code{.c}
struct starpu_codelet cl =
@@ -850,12 +858,31 @@ struct starpu_codelet cl =
	.nbuffers = 2,
	.modes = {STARPU_RW, STARPU_RW},
	.specific_nodes = 1,
-	.nodes = {STARPU_MAIN_RAM, -1},
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},
};
\endcode

-the first data of the task will be kept in the main memory, while the second
+the first data of the task will be kept in the CPU memory, while the second
data will be copied to the CUDA GPU as usual. A working example is available in
<c>tests/datawizard/specific_node.c</c>

+With the following codelet:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_SLOW},
+};
+\endcode
+
+The first data will be copied into the fast (but probably size-limited) local
+memory, while the second data will be left in slow (but large) memory. This makes
+sense when the kernel makes only a few accesses to the second data: fetching it
+remotely, e.g. over a PCI bus, is then not a performance problem, and it avoids
+filling the fast local memory with data that does not need fast access.
+
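+As mentioned above, when starpu_codelet::nbuffers is greater than \ref
+STARPU_NMAXBUFS, the starpu_codelet::dyn_nodes array is used instead of
+starpu_codelet::nodes. The following is only a minimal, hypothetical sketch of
+how it could be filled: the kernel name, the buffer count <c>NB</c> and the
+initialization helper are made up for illustration, and the
+starpu_codelet::dyn_modes and starpu_codelet::dyn_nodes arrays are assumed to be
+allocated by the application, like starpu_task::dyn_handles:
+
+\code{.c}
+#include <stdlib.h>
+#include <starpu.h>
+
+/* hypothetical kernel, as in the examples above */
+void kernel(void *buffers[], void *cl_arg);
+
+/* assumed to be greater than STARPU_NMAXBUFS */
+#define NB 42
+
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel },
+	.nbuffers = NB,
+	.specific_nodes = 1,
+};
+
+void init_codelet_nodes(void)
+{
+	int i;
+
+	/* per-buffer access modes and target memory nodes */
+	cl.dyn_modes = malloc(NB * sizeof(*cl.dyn_modes));
+	cl.dyn_nodes = malloc(NB * sizeof(*cl.dyn_nodes));
+	for (i = 0; i < NB; i++)
+	{
+		cl.dyn_modes[i] = STARPU_RW;
+		/* keep the first buffer in CPU-accessible memory, let StarPU
+		 * place the others on the memory node of the worker */
+		cl.dyn_nodes[i] = i == 0 ? STARPU_SPECIFIC_NODE_CPU : STARPU_SPECIFIC_NODE_LOCAL;
+	}
+}
+\endcode
+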
*/