@@ -840,8 +840,16 @@ main memory instead of copied in the GPU, a pivoting vector for instance.
This can be achieved by setting the starpu_codelet::specific_nodes flag to
<c>1</c>, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
-where data should be copied to, or <c>-1</c> to let StarPU copy it to the memory node
-where the task will be executed. For instance, with the following codelet:
+where data should be copied to, or <c>STARPU_SPECIFIC_NODE_LOCAL</c> to let
+StarPU copy it to the memory node where the task will be executed.
+
+<c>STARPU_SPECIFIC_NODE_CPU</c> can also be used to request data to be
+put in CPU-accessible memory (and let StarPU choose the NUMA node).
+<c>STARPU_SPECIFIC_NODE_FAST</c> and <c>STARPU_SPECIFIC_NODE_SLOW</c> can also
+be used to request data to be put in fast (but probably size-limited) or slow
+(but large) memory, as illustrated in the second example below.
+
+For instance, with the following codelet:

\code{.c}
struct starpu_codelet cl =
@@ -850,12 +858,31 @@ struct starpu_codelet cl =
	.nbuffers = 2,
	.modes = {STARPU_RW, STARPU_RW},
	.specific_nodes = 1,
-	.nodes = {STARPU_MAIN_RAM, -1},
+	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},
};
\endcode

-the first data of the task will be kept in the main memory, while the second
+the first data of the task will be kept in the CPU memory, while the second
data will be copied to the CUDA GPU as usual. A working example is available in
<c>tests/datawizard/specific_node.c</c>

+With the following codelet:
+
+\code{.c}
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel },
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW},
+	.specific_nodes = 1,
+	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_SLOW},
+};
+\endcode
+
+The first data will be copied into the fast (but probably size-limited) local
+memory, while the second data will be left in slow (but large) memory. This makes
+sense when the kernel makes only a few accesses to the second data: fetching it
+remotely, e.g. over a PCI bus, is then not a performance problem, and it avoids
+filling the fast local memory with data that does not need fast access.
+
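+As mentioned above, when starpu_codelet::nbuffers is greater than \ref
+STARPU_NMAXBUFS, the starpu_codelet::dyn_nodes array is used instead of
+starpu_codelet::nodes. The following is only a minimal, hypothetical sketch of
+how it could be filled: the kernel name, the buffer count <c>NB</c> and the
+initialization helper are made up for illustration, and the
+starpu_codelet::dyn_modes and starpu_codelet::dyn_nodes arrays are assumed to be
+allocated by the application, like starpu_task::dyn_handles:
+
+\code{.c}
+#include <stdlib.h>
+#include <starpu.h>
+
+/* hypothetical kernel, as in the examples above */
+void kernel(void *buffers[], void *cl_arg);
+
+/* assumed to be greater than STARPU_NMAXBUFS */
+#define NB 42
+
+struct starpu_codelet cl =
+{
+	.cuda_funcs = { kernel },
+	.nbuffers = NB,
+	.specific_nodes = 1,
+};
+
+void init_codelet_nodes(void)
+{
+	int i;
+
+	/* per-buffer access modes and target memory nodes */
+	cl.dyn_modes = malloc(NB * sizeof(*cl.dyn_modes));
+	cl.dyn_nodes = malloc(NB * sizeof(*cl.dyn_nodes));
+	for (i = 0; i < NB; i++)
+	{
+		cl.dyn_modes[i] = STARPU_RW;
+		/* keep the first buffer in CPU-accessible memory, let StarPU
+		 * place the others on the memory node of the worker */
+		cl.dyn_nodes[i] = i == 0 ? STARPU_SPECIFIC_NODE_CPU : STARPU_SPECIFIC_NODE_LOCAL;
+	}
+}
+\endcode
+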
*/