| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281 | /* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. *//*! \page DataManagement Data ManagementTODO: intro which mentions consistency among other things\section DataInterface Data InterfaceStarPU provides several data interfaces for programmers to describethe data layout of their application. There are predefined interfacesalready available in StarPU. Users can define new data interfaces asexplained in \ref DefiningANewDataInterface. All functions provided byStarPU are documented in \ref API_Data_Interfaces. You will find ashort list below.\subsection VariableDataInterface Variable Data InterfaceA variable is a given-size byte element, typically a scalar. Here anexample of how to register a variable data to StarPU by usingstarpu_variable_data_register().\code{.c}float var = 42.0;starpu_data_handle_t var_handle;starpu_variable_data_register(&var_handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));\endcode\subsection VectorDataInterface Vector Data InterfaceA vector is a fixed number of elements of a given size. Here anexample of how to register a vector data to StarPU by usingstarpu_vector_data_register().\code{.c}float vector[NX];starpu_data_handle_t vector_handle;starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));\endcodeVectors can be partitioned into pieces by usingstarpu_vector_filter_block(). They can also be partitioned with some overlappingby using starpu_vector_filter_block_shadow(). By default StarPUuses the same size for each piece. If different sizes are desired,starpu_vector_filter_list() or starpu_vector_filter_list_long() can be usedinstead. To just divide in two pieces, starpu_vector_filter_divide_in_2() can be used.\subsection MatrixDataInterface Matrix Data InterfaceTo register 2-D matrices with a potential padding, one can use thematrix data interface. Here an example of how to register a matrixdata to StarPU by using starpu_matrix_data_register().\code{.c}float *matrix;starpu_data_handle_t matrix_handle;matrix = (float*)malloc(width * height * sizeof(float));starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));\endcode2D matrices can be partitioned into 2D matrices along the x dimension byusing starpu_matrix_filter_block(), and along the y dimension by usingstarpu_matrix_filter_vertical_block(). They can also be partitionedwith some overlapping by using starpu_matrix_filter_block_shadow() andstarpu_matrix_filter_vertical_block_shadow().\subsection BlockDataInterface Block Data InterfaceTo register 3-D matrices with potential paddings on Y and Z dimensions,one can use the block data interface. Here an example of how toregister a block data to StarPU by using starpu_block_data_register().\code{.c}float *block;starpu_data_handle_t block_handle;block = (float*)malloc(nx*ny*nz*sizeof(float));starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));\endcode3D matrices can be partitioned along the x dimension byusing starpu_block_filter_block(), or along the y dimensionby using starpu_block_filter_vertical_block(), or along thez dimension by using starpu_block_filter_depth_block(). Theycan also be partitioned with some overlapping by usingstarpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),or starpu_block_filter_depth_block_shadow().\subsection TensorDataInterface Tensor Data InterfaceTo register 4-D matrices with potential paddings on Y, Z, and T dimensions,one can use the tensor data interface. Here an example of how toregister a tensor data to StarPU by using starpu_tensor_data_register().\code{.c}float *block;starpu_data_handle_t block_handle;block = (float*)malloc(nx*ny*nz*nt*sizeof(float));starpu_tensor_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx*ny*nz, nx, ny, nz, nt, sizeof(float));\endcodePartitioning filters are not implemented yet.\subsection BCSRDataInterface BCSR Data InterfaceBCSR (Blocked Compressed Sparse Row Representation) sparse matrix datacan be registered to StarPU using the bcsr data interface. Here anexample on how to do so by using starpu_bcsr_data_register().\code{.c}/* * We use the following matrix: * *   +----------------+ *   |  0   1   0   0 | *   |  2   3   0   0 | *   |  4   5   8   9 | *   |  6   7  10  11 | *   +----------------+ * * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11] * colind = [0, 0, 1] * rowptr = [0, 1, 3] * r = c = 2 *//* Size of the blocks */int R = 2;int C = 2;int NROWS = 2;int NNZ_BLOCKS = 3;    /* out of 4 */int NZVAL_SIZE = (R*C*NNZ_BLOCKS);int nzval[NZVAL_SIZE]  ={	0, 1, 2, 3,    /* First block  */	4, 5, 6, 7,    /* Second block */	8, 9, 10, 11   /* Third block  */};uint32_t colind[NNZ_BLOCKS] ={	0, /* block-column index for first block in nzval */	0, /* block-column index for second block in nzval */	1  /* block-column index for third block in nzval */};uint32_t rowptr[NROWS+1] ={	0, / * block-index in nzval of the first block of the first row. */	1, / * block-index in nzval of the first block of the second row. */	NNZ_BLOCKS /* number of blocks, to allow an easier element's access for the kernels */};starpu_data_handle_t bcsr_handle;starpu_bcsr_data_register(&bcsr_handle,			  STARPU_MAIN_RAM,			  NNZ_BLOCKS,			  NROWS,			  (uintptr_t) nzval,			  colind,			  rowptr,			  0, /* firstentry */			  R,			  C,			  sizeof(nzval[0]));\endcodeStarPU provides an example on how to deal with such matrices in<c>examples/spmv</c>.BCSR data handles can be partitioned into its dense matrix blocks by usingstarpu_bcsr_filter_canonical_block(), or split into other BCSR data handles byusing starpu_bcsr_filter_vertical_block() (but only split along the leading dimension issupported, i.e. along adjacent nnz blocks)\subsection CSRDataInterface CSR Data InterfaceTODOCSR data handles can be partitioned into vertical CSR matrices by usingstarpu_csr_filter_vertical_block().\subsection VariableSizeDataInterface Data Interface with Variable SizeTasks are actually allowed to change the size of data interfaces.The simplest case is just changing the amount of data actually used within theallocated buffer. This is for instance implemented for the matrix interface: onecan set the new NX/NY values with STARPU_MATRIX_SET_NX(), STARPU_MATRIX_SET_NY(), and STARPU_MATRIX_SET_LD()at the end of the task implementation. Data transfers achieved by StarPU willthen use these values instead of the whole allocated size. The values of courseneed to be set within the original allocation. To reserve room for increasingthe NX/NY values, one can use starpu_matrix_data_register_allocsize() instead ofstarpu_matrix_data_register(), to specify the allocation size to be used insteadof the default NX*NY*ELEMSIZE. To support this, the data interfacehas to implement the functions starpu_data_interface_ops::alloc_footprint andstarpu_data_interface_ops::alloc_compare, for proper StarPU allocationmanagement.A more involved case is changing the amount of allocated data.The task implementation can just reallocate the buffer during its execution, andset the proper new values in the interface structure, e.g. nx, ny, ld, etc. sothat the StarPU core knows the new data layout. The structure starpu_data_interface_opshowever then needs to have the field starpu_data_interface_ops::dontcacheset to 1, to prevent StarPU from trying to perform any cached allocation,since the allocated size will vary. An example is available in<c>tests/datawizard/variable_size.c</c>. The example uses its own datainterface so as to contain some simulation information for data growth, but theprinciple can be applied for any data interface.The principle is to use starpu_malloc_on_node_flags() to make the newallocation, and use starpu_free_on_node_flags() to release any previousallocation. The flags have to be precisely like in the example:\code{.c}unsigned workerid = starpu_worker_get_id_check();unsigned dst_node = starpu_worker_get_memory_node(workerid);interface->ptr = starpu_malloc_on_node_flags(dst_node, size + increase, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT | STARPU_MEMORY_OVERFLOW);starpu_free_on_node_flags(dst_node, old, size, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT | STARPU_MEMORY_OVERFLOW);interface->size += increase;\endcodeso that the allocated area has the expected properties and the allocation is accounted for properly.Depending on the interface (vector, CSR, etc.) you may have to fix severalmembers of the data interface: e.g. both <c>nx</c> and <c>allocsize</c> forvectors, and store the pointer both in <c>ptr</c> and <c>dev_handle</c>.Some interfaces make a distinction between the actual number of elementsstored in the data and the actually allocated buffer. For instance, the vectorinterface uses the <c>nx</c> field for the former, and the <c>allocsize</c> forthe latter. This allows for lazy reallocation to avoid reallocating the buffereverytime to exactly match the actual number of elements. Computations and datatransfers will use the field <c>nx</c>, while allocation functions will use the field<c>allocsize</c>. One just has to make sure that <c>allocsize</c> is alwaysbigger or equal to <c>nx</c>.Important note: one can not change the size of a partitioned data.\section DataManagement Data ManagementWhen the application allocates data, whenever possible it should usethe function starpu_malloc(), which will ask CUDA or OpenCL to makethe allocation itself and pin the corresponding allocated memory, or to use the functionstarpu_memory_pin() to pin memory allocated by other ways, such as local arrays. Thisis needed to permit asynchronous data transfer, i.e. permit datatransfer to overlap with computations. Otherwise, the trace will showthat the state <c>DriverCopyAsync</c> takes a lot of time, this isbecause CUDA or OpenCL then reverts to synchronous transfers.The application can provide its own allocation function by callingstarpu_malloc_set_hooks(). StarPU will then use them for all data handleallocations in the main memory.By default, StarPU leaves replicates of data wherever they were used, in case theywill be re-used by other tasks, thus saving the data transfer time. When sometask modifies some data, all the other replicates are invalidated, and only theprocessing unit which ran this task will have a valid replicate of the data. If the application knowsthat this data will not be re-used by further tasks, it should advise StarPU toimmediately replicate it to a desired list of memory nodes (given through abitmask). This can be understood like the write-through mode of CPU caches.\code{.c}starpu_data_set_wt_mask(img_handle, 1<<0);\endcodewill for instance request to always automatically transfer a replicate into themain memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.\code{.c}starpu_data_set_wt_mask(img_handle, ~0U);\endcodewill request to always automatically broadcast the updated data to all memorynodes.Setting the write-through mask to <c>~0U</c> can also be useful to make sure allmemory nodes always have a copy of the data, so that it is never evicted whenmemory gets scarse.Implicit data dependency computation can become expensive if a lotof tasks access the same piece of data. If no dependency is requiredon some piece of data (e.g. because it is only accessed in read-onlymode, or because write accesses are actually commutative), use thefunction starpu_data_set_sequential_consistency_flag() to disableimplicit dependencies on this data.In the same vein, accumulation of results in the same data can become abottleneck. The use of the mode ::STARPU_REDUX permits to optimize suchaccumulation (see \ref DataReduction). To a lesser extent, the use ofthe flag ::STARPU_COMMUTE keeps the bottleneck (see \ref DataCommute), but at least permitsthe accumulation to happen in any order.Applications often need a data just for temporary results.  In such a case,registration can be made without an initial value, for instance this produces a vector data:\code{.c}starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));\endcodeStarPU will then allocate the actual buffer only when it is actually needed,e.g. directly on the GPU without allocating in main memory.In the same vein, once the temporary results are not useful any more, thedata should be thrown away. If the handle is not to be reused, it can beunregistered:\code{.c}starpu_data_unregister_submit(handle);\endcodeactual unregistration will be done after all tasks working on the handleterminate.If the handle is to be reused, instead of unregistering it, it can simply be invalidated:\code{.c}starpu_data_invalidate_submit(handle);\endcodethe buffers containing the current value will then be freed, and reallocatedonly when another task writes some value to the handle.\section DataPrefetch Data PrefetchThe scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>perform data prefetch (see \ref STARPU_PREFETCH):as soon as a scheduling decision is taken for a task, requests are issued totransfer its required data to the target processing unit, if needed, so thatwhen the processing unit actually starts the task, its data will hopefully bealready available and it will not have to wait for the transfer to finish.The application may want to perform some manual prefetching, for several reasonssuch as excluding initial data transfers from performance measurements, orsetting up an initial statically-computed data distribution on the machinebefore submitting tasks, which will thus guide StarPU toward an initial taskdistribution (since StarPU will try to avoid further transfers).This can be achieved by giving the function starpu_data_prefetch_on_node() thehandle and the desired target memory node. The variantstarpu_data_idle_prefetch_on_node() can be used to issue the transferonly when the bus is idle.Conversely, one can advise StarPU that some data will not be useful in theclose future by calling starpu_data_wont_use(). StarPU will then write its valueback to its home node, and evict it from GPUs when room is needed.\section PartitioningData Partitioning DataAn existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:\code{.c}#define NX 1048576#define PARTS 16int vector[NX];starpu_data_handle_t handle;/* Declare data to StarPU */starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));/* Partition the vector in PARTS sub-vectors */struct starpu_data_filter f ={    .filter_func = starpu_vector_filter_block,    .nchildren = PARTS};starpu_data_partition(handle, &f);\endcodeThe task submission then uses the function starpu_data_get_sub_data()to retrieve the sub-handles to be passed as tasks parameters.\code{.c}/* Submit a task on each sub-vector */for (i=0; i<starpu_data_get_nb_children(handle); i++){    /* Get subdata number i (there is only 1 dimension) */    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);    struct starpu_task *task = starpu_task_create();    task->handles[0] = sub_handle;    task->cl = &cl;    task->synchronous = 1;    task->cl_arg = &factor;    task->cl_arg_size = sizeof(factor);    starpu_task_submit(task);}\endcodePartitioning can be applied several times, see<c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>.Wherever the whole piece of data is already available, the partitioning willbe done in-place, i.e. without allocating new buffers but just using pointersinside the existing copy. This is particularly important to be aware of whenusing OpenCL, where the kernel parameters are not pointers, but \c cl_mem handles. Thekernel thus needs to be also passed the offset within the OpenCL buffer:\code{.c}void opencl_func(void *buffers[], void *cl_arg){    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);    unsigned offset = STARPU_BLOCK_GET_OFFSET(buffers[0]);    ...    clSetKernelArg(kernel, 0, sizeof(vector), &vector);    clSetKernelArg(kernel, 1, sizeof(offset), &offset);    ...}\endcodeAnd the kernel has to shift from the pointer passed by the OpenCL driver:\code{.c}__kernel void opencl_kernel(__global int *vector, unsigned offset){    block = (__global void *)block + offset;    ...}\endcodeWhen the sub-data is not of the same type as the original data, the fieldstarpu_data_filter::get_child_ops needs to be set appropriately for StarPUto know which type should be used.StarPU provides various interfaces and filters for matrices, vectors, etc.,but applications can also write their own data interfaces and filters, see<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example,and see \ref DefiningANewDataInterface and \ref DefiningANewDataFilterfor documentation.\section AsynchronousPartitioning Asynchronous PartitioningThe partitioning functions described in the previous section are synchronous:starpu_data_partition() and starpu_data_unpartition() both wait for all the taskscurrently working on the data.  This can be a bottleneck for the application.An asynchronous API also exists, it works only on handles with sequentialconsistency. The principle is to first plan the partitioning, which returnsdata handles of the partition, which are not functional yet. When submittingtasks, one can mix using the handles of the partition, of the whole data. Onecan even partition recursively and mix using handles at different levels of therecursion. Of course, StarPU will have to introduce coherency synchronization.<c>fmultiple_submit_implicit</c> is a complete example using this technique.One can also look at <c>fmultiple_submit_readonly</c> which contains theexplicit coherency synchronization which are automatically introduced by StarPUfor <c>fmultiple_submit_implicit</c>.In short, we first register a matrix and plan the partitioning:\code{.c}starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));struct starpu_data_filter f_vert ={	.filter_func = starpu_matrix_filter_block,	.nchildren = PARTS};starpu_data_partition_plan(handle, &f_vert, vert_handle);\endcodestarpu_data_partition_plan() returns the handles for the partition in <c>vert_handle</c>.One can then submit tasks working on the main handle, and tasks working on the handles<c>vert_handle</c>. Between using the main handle and the handles <c>vert_handle</c>, StarPU will automatically call starpu_data_partition_submit() andstarpu_data_unpartition_submit().All this code is asynchronous, just submitting which tasks, partitioning andunpartitioning will be done at runtime.Planning several partitioning of the same data is also possible, StarPU willunpartition and repartition as needed when mixing accesses of differentpartitions. If data access is done in read-only mode, StarPU will allow thedifferent partitioning to coexist. As soon as a data is accessed in read-writemode, StarPU will automatically unpartition everything and activate only thepartitioning leading to the data being written to.For instance, for a stencil application, one can split a subdomain intoits interior and halos, and then just submit a task updating the wholesubdomain, then submit MPI sends/receives to update the halos, then submitagain a task updating the whole subdomain, etc. and StarPU will automaticallypartition/unpartition each time.\section ManualPartitioning Manual PartitioningOne can also handle partitioning by hand, by registering several views on thesame piece of data. The idea is then to manage the coherency of the variousviews through the common buffer in the main memory.<c>fmultiple_manual</c> is a complete example using this technique.In short, we first register the same matrix several times:\code{.c}starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));for (i = 0; i < PARTS; i++)	starpu_matrix_data_register(&vert_handle[i], STARPU_MAIN_RAM, (uintptr_t)&matrix[0][i*(NX/PARTS)], NX, NX/PARTS, NY, sizeof(matrix[0][0]));\endcodeSince StarPU is not aware that the two handles are actually pointing to the samedata, we have a danger of inadvertently submitting tasks to both views, whichwill bring a mess since StarPU will not guarantee any coherency between the twoviews.  To make sure we don't do this, we invalidate the view that we will notuse:\code{.c}for (i = 0; i < PARTS; i++)	starpu_data_invalidate(vert_handle[i]);\endcodeThen we can safely work on <c>handle</c>.When we want to switch to the vertical slice view, all we need to do is bringcoherency between them by running an empty task on the home node of the data:\code{.c}void empty(void *buffers[], void *cl_arg){ }struct starpu_codelet cl_switch ={	.cpu_funcs = {empty},	.nbuffers = STARPU_VARIABLE_NBUFFERS,};ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,			STARPU_W, vert_handle[0],			STARPU_W, vert_handle[1],			0);\endcodeThe execution of the task <c>switch</c> will get back the matrix data into themain memory, and thus the vertical slices will get the updated value there.Again, we prefer to make sure that we don't accidentally access the matrix through the whole-matrix handle:\code{.c}starpu_data_invalidate_submit(handle);\endcodeAnd now we can start using vertical slices, etc.\section DataPointers Handles data buffer pointersA simple understanding of StarPU handles is that it's a collection of buffers oneach memory node of the machine, which contain the same data.  The picture ishowever made more complex with the OpenCL support and with partitioning.When partitioning a handle, the data buffers of the subhandles will indeedbe inside the data buffers of the main handle (to save transferring databack and forth between the main handle and the subhandles). But in OpenCL,a <c>cl_mem</c> is not a pointer, but an opaque value on which pointerarithmetic can not be used. That is why data interfaces contain three fields:<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>.<ul><li> The field <c>dev_handle</c> is what the allocation functionreturned, and one can not do arithmetic on it.</li><li> The field <c>offset</c> is the offset inside the allocated area,most often it will be 0 because data start at the beginning of theallocated area, but when the handle is partitioned, the subhandleswill have varying <c>offset</c> values, for each subpiece.</li><li> The field <c>ptr</c>, in the non-OpenCL case, i.e. when pointerarithmetic can be used on <c>dev_handle</c>, is just the sum of<c>dev_handle</c> and <c>offset</c>, provided for convenience.</li></ul>This means that:<ul><li>computation kernels can use <c>ptr</c> in non-OpenCL implementations.</li><li>computation kernels have to use <c>dev_handle</c> and <c>offset</c> in the OpenCL implementation.</li><li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node() in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li><li>partitioning filters have to copy over <c>dev_handle</c> without modifying it, set in the child different values of <c>offset</c>, and set <c>ptr</c> accordingly as the sum of <c>dev_handle</c> and <c>offset</c>.</li></ul>\section DefiningANewDataFilter Defining A New Data FilterStarPU provides a series of predefined filters in \ref API_Data_Partition, butadditional filters can be defined by the application. The principle is that thefilter function just fills the memory location of the <c>i-th</c> subpart of a data.Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,check \ref starpu_data_filter::filter_func for further details.The helper function starpu_filter_nparts_compute_chunk_size_and_offset() can be used tocompute the division of pieces of data.\section DataReduction Data ReductionIn various cases, some piece of data is used to accumulate intermediateresults. For instances, the dot product of a vector, maximum/minimum finding,the histogram of a photograph, etc. When these results are produced along thewhole machine, it would not be efficient to accumulate them in only one place,incurring data transmission each and access concurrency.StarPU provides a mode ::STARPU_REDUX, which permits to optimizethis case: it will allocate a buffer on each worker (lazily), and accumulateintermediate results there. When the data is eventually accessed in the normalmode ::STARPU_R, StarPU will collect the intermediate results in just onebuffer.For this to work, the user has to use the functionstarpu_data_set_reduction_methods() to declare how to initialize thesebuffers, and how to assemble partial results.For instance, <c>cg</c> uses that to optimize its dot product: it first definesthe codelets for initialization and reduction:\code{.c}struct starpu_codelet bzero_variable_cl ={        .cpu_funcs = { bzero_variable_cpu },        .cpu_funcs_name = { "bzero_variable_cpu" },        .cuda_funcs = { bzero_variable_cuda },        .nbuffers = 1,}static void accumulate_variable_cpu(void *descr[], void *cl_arg){        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);        *v_dst = *v_dst + *v_src;}static void accumulate_variable_cuda(void *descr[], void *cl_arg){        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);        cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);        cudaStreamSynchronize(starpu_cuda_get_local_stream());}struct starpu_codelet accumulate_variable_cl ={        .cpu_funcs = { accumulate_variable_cpu },        .cpu_funcs_name = { "accumulate_variable_cpu" },        .cuda_funcs = { accumulate_variable_cuda },        .nbuffers = 2,	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},}\endcodeand attaches them as reduction methods for its handle <c>dtq</c>:\code{.c}starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));starpu_data_set_reduction_methods(dtq_handle, &accumulate_variable_cl, &bzero_variable_cl);\endcodeand <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for thedot products with partitioned vectors:\code{.c}for (b = 0; b < nblocks; b++)    starpu_task_insert(&dot_kernel_cl,        STARPU_REDUX, dtq_handle,        STARPU_R, starpu_data_get_sub_data(v1, 1, b),        STARPU_R, starpu_data_get_sub_data(v2, 1, b),        0);\endcodeDuring registration, we have here provided <c>NULL</c>, i.e. there isno initial value to be taken into account during reduction. StarPUwill thus only take into account the contributions from the tasks<c>dot_kernel_cl</c>. Also, it will not allocate any memory for<c>dtq_handle</c> before tasks <c>dot_kernel_cl</c> are ready to run.If another dot product has to be performed, one could unregister<c>dtq_handle</c>, and re-register it. But one can also callstarpu_data_invalidate_submit() with the parameter <c>dtq_handle</c>,which will clear all data from the handle, thus resetting it back tothe initial status <c>register(NULL)</c>.The example <c>cg</c> also uses reduction for the blocked gemv kernel,leading to yet more relaxed dependencies and more parallelism.::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPIcase. This will however not produce any MPI communication, but just pass::STARPU_REDUX to the underlying starpu_task_insert(). It is up to theapplication to call starpu_mpi_redux_data(), which posts tasks which willreduce the partial results among MPI nodes into the MPI node which owns thedata. For instance, some hypothetical application which collects partial resultsinto data <c>res</c>, then uses it for other computation, before looping againwith a new reduction:\code{.c}for (i = 0; i < 100; i++){    starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);    starpu_mpi_redux_data(MPI_COMM_WORLD, res);    starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);}\endcode\section DataCommute Commute Data AccessBy default, the implicit dependencies computed from data access use thesequential semantic. Notably, write accesses are always serialized in the orderof submission. In some applicative cases, the write contributions can actuallybe performed in any order without affecting the eventual result. In this caseit is useful to drop the strictly sequential semantic, to improve parallelismby allowing StarPU to reorder the write accesses. This can be done by usingthe data access flag ::STARPU_COMMUTE. Accesses without this flag will howeverproperly be serialized against accesses with this flag. For instance:\code{.c}    starpu_task_insert(&cl1, STARPU_R, h, STARPU_RW, handle, 0);    starpu_task_insert(&cl2, STARPU_R, handle1, STARPU_RW|STARPU_COMMUTE, handle, 0);    starpu_task_insert(&cl2, STARPU_R, handle2, STARPU_RW|STARPU_COMMUTE, handle, 0);    starpu_task_insert(&cl3, STARPU_R, g, STARPU_RW, handle, 0);\endcodeThe two tasks running <c>cl2</c> will be able to commute: depending on whether thevalue of <c>handle1</c> or <c>handle2</c> becomes available first, the corresponding taskrunning <c>cl2</c> will start first. The task running <c>cl1</c> will however always be runbefore them, and the task running <c>cl3</c> will always be run after them.If a lot of tasks use the commute access on the same set of data and a lot ofthem are ready at the same time, it may become interesting to use an arbiter,see \ref ConcurrentDataAccess.\section ConcurrentDataAccess Concurrent Data AccessesWhen several tasks are ready and will work on several data, StarPU is faced withthe classical Dining Philosophers problem, and has to determine the order inwhich it will run the tasks.Data accesses usually use sequential ordering, so data accesses are usuallyalready serialized, and thus by default StarPU uses the Dijkstra solution whichscales very well in terms of overhead: tasks will just acquire data one by oneby data handle pointer value order.When sequential ordering is disabled or the flag ::STARPU_COMMUTE is used, theremay be a lot of concurrent accesses to the same data, and the Dijkstra solutiongets only poor parallelism, typically in some pathological cases which do happenin various applications. In this case, one can use a data access arbiter, whichimplements the classical centralized solution for the Dining Philosophersproblem. This is more expensive in terms of overhead since it is centralized,but it opportunistically gets a lot of parallelism. The centralization can alsobe avoided by using several arbiters, thus separating sets of data for whicharbitration will be done.  If a task accesses data from different arbiters, itwill acquire them arbiter by arbiter, in arbiter pointer value order.See the <c>tests/datawizard/test_arbiter.cpp</c> example.Arbiters however do not support the flag ::STARPU_REDUX yet.\section TemporaryBuffers Temporary BuffersThere are two kinds of temporary buffers: temporary data which just pass resultsfrom a task to another, and scratch data which are needed only internally bytasks.\subsection TemporaryData Temporary DataData can sometimes be entirely produced by a task, and entirely consumed byanother task, without the need for other parts of the application to accessit. In such case, registration can be done without prior allocation, by usingthe special memory node number <c>-1</c>, and passing a zero pointer. StarPU willactually allocate memory only when the task creating the content gets scheduled,and destroy it on unregistration.In addition to this, it can be tedious for the application to have to unregisterthe data, since it will not use its content anyway. The unregistration can bedone lazily by using the function starpu_data_unregister_submit(),which will record that no more tasks accessing the handle will be submitted, sothat it can be freed as soon as the last task accessing it is over.The following code examplifies both points: it registers the temporarydata, submits three tasks accessing it, and records the data for automaticunregistration.\code{.c}starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));starpu_task_insert(&produce_data, STARPU_W, handle, 0);starpu_task_insert(&compute_data, STARPU_RW, handle, 0);starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);starpu_data_unregister_submit(handle);\endcodeThe application may also want to see the temporary data initializedon the fly before being used by the task. This can be done by usingstarpu_data_set_reduction_methods() to set an initialization codelet (no reduxcodelet is needed).\subsection ScratchData Scratch DataSome kernels sometimes need temporary data to achieve the computations, i.e. aworkspace. The application could allocate it at the start of the codeletfunction, and free it at the end, but this would be costly. It could alsoallocate one buffer per worker (similarly to \ref HowToInitializeAComputationLibraryOnceForEachWorker),but this wouldmake them systematic and permanent. A more  optimized way is to usethe data access mode ::STARPU_SCRATCH, as examplified below, whichprovides per-worker buffers without content consistency. The buffer isregistered only once, using memory node <c>-1</c>, i.e. the application didn't allocatememory for it, and StarPU will allocate it on demand at task execution.\code{.c}starpu_vector_data_register(&workspace, -1, 0, sizeof(float));for (i = 0; i < N; i++)    starpu_task_insert(&compute, STARPU_R, input[i], STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);\endcodeStarPU will make sure that the buffer is allocated before executing the task,and make this allocation per-worker: for CPU workers, notably, each worker hasits own buffer. This means that each task submitted above will actually have itsown workspace, which will actually be the same for all tasks running one afterthe other on the same worker. Also, if for instance memory becomes scarce,StarPU will notice that it can free such buffers easily, since the content doesnot matter.The example <c>examples/pi</c> uses scratches for some temporary buffer.\section TheMultiformatInterface The Multiformat InterfaceIt may be interesting to represent the same piece of data using two differentdata structures: one only used on CPUs, and one only used on GPUs.This can be done by using the multiformat interface. StarPUwill be able to convert data from one data structure to the other when needed.Note that the scheduler <c>dmda</c> is the only one optimized for thisinterface. The user must provide StarPU with conversion codelets:\snippet multiformat.c To be included. You should update doxygen if you see this text.Kernels can be written almost as for any other interface. Note that::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernelsmust use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX maybe used in any kind of kernel.\code{.c}static voidmultiformat_scal_cpu_func(void *buffers[], void *args){    struct point *aos;    unsigned int n;    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);    ...}extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args){    unsigned int n;    struct struct_of_arrays *soa;    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);    ...}\endcodeA full example may be found in <c>examples/basic_examples/multiformat.c</c>.\section DefiningANewDataInterface Defining A New Data InterfaceThis section proposes an example how to define your own interface, when theStarPU-provided interface do not fit your needs. Here we take a dumb example ofan array of complex numbers represented by two arrays of double values.Let's thus define a new data interface to manage arrays of complex numbers:\code{.c}/* interface for complex numbers */struct starpu_complex_interface{        double *real;        double *imaginary;        int nx;};\endcodeThat structure stores enough to describe <b>one</b> buffer of such kind ofdata. It is used for the buffer stored in the main memory, another instanceis used for the buffer stored in a GPU, etc. A <i>data handle</i> is thus acollection of such structures, to describe each buffer on each memory node.Note: one should not take pointers into such structures, because StarPU needsto be able to copy over the content of it to various places, for instance toefficiently migrate a data buffer from one data handle to another data handle.\subsection DefiningANewDataInterface_registration Data registrationRegistering such a data to StarPU is easily done using the functionstarpu_data_register(). The lastparameter of the function, <c>interface_complex_ops</c>, will bedescribed below.\code{.c}void starpu_complex_data_register(starpu_data_handle_t *handleptr,     unsigned home_node, double *real, double *imaginary, int nx){        struct starpu_complex_interface complex =        {                .real = real,                .imaginary = imaginary,                .nx = nx        };        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)        {                interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();        }        starpu_data_register(handleptr, home_node, &complex, &interface_complex_ops);}\endcodeThe <c>struct starpu_complex_interface complex</c> is here used just to store theparameters that the user provided to <c>starpu_complex_data_register</c>.starpu_data_register() will first allocate the handle, andthen pass the structure <c>starpu_complex_interface</c> to the methodstarpu_data_interface_ops::register_data_handle, which records themwithin the data handle (it is called once per node by starpu_data_register()):\code{.c}static void complex_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface){	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;	unsigned node;	for (node = 0; node < STARPU_MAXNODES; node++)	{		struct starpu_complex_interface *local_interface = (struct starpu_complex_interface *)			starpu_data_get_interface_on_node(handle, node);		local_interface->nx = complex_interface->nx;		if (node == home_node)		{			local_interface->real = complex_interface->real;			local_interface->imaginary = complex_interface->imaginary;		}		else		{			local_interface->real = NULL;			local_interface->imaginary = NULL;		}	}}\endcodeIf the application provided a home node, the corresponding pointers will berecorded for that node. Others have no buffer allocated yet.Different operations need to be defined for a data interface throughthe type starpu_data_interface_ops. We only define here the basicoperations needed to run simple applications. The source code for thedifferent functions can be found in the file<c>examples/interface/complex_interface.c</c>, the details of the hooks to beprovided are documented in \ref starpu_data_interface_ops .\code{.c}static struct starpu_data_interface_ops interface_complex_ops ={        .register_data_handle = complex_register_data_handle,        .allocate_data_on_node = complex_allocate_data_on_node,        .copy_methods = &complex_copy_methods,        .get_size = complex_get_size,        .footprint = complex_footprint,        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,        .interface_size = sizeof(struct starpu_complex_interface),};\endcodeConvenience functions can defined to access the different fields of thecomplex interface from a StarPU data handle after a call to starpu_data_acquire():\code{.c}double *starpu_complex_get_real(starpu_data_handle_t handle){        struct starpu_complex_interface *complex_interface =          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);        return complex_interface->real;}double *starpu_complex_get_imaginary(starpu_data_handle_t handle);int starpu_complex_get_nx(starpu_data_handle_t handle);\endcodeSimilar functions need to be defined to access the different fields of thecomplex interface from a <c>void *</c> pointer to be used within codeletimplemetations.\snippet complex.c To be included. You should update doxygen if you see this text.Complex data interfaces can then be registered to StarPU.\code{.c}double real = 45.0;double imaginary = 12.0;starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);starpu_task_insert(&cl_display, STARPU_R, handle1, 0);\endcodeand used by codelets.\code{.c}void display_complex_codelet(void *descr[], void *_args){        int nx = STARPU_COMPLEX_GET_NX(descr[0]);        double *real = STARPU_COMPLEX_GET_REAL(descr[0]);        double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);        int i;        for(i=0 ; i<nx ; i++)        {                fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);        }}\endcodeThe whole code for this complex data interface is available in thedirectory <c>examples/interface/</c>.\subsection DefiningANewDataInterface_allocation Data allocationTo be able to run tasks on GPUs etc. StarPU needs to know how to allocate abuffer for the interface. In our example, two allocations are needed in theallocation method \c complex_allocate_data_on_node(): one for the real part and onefor the imaginary part.\code{.c}static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, unsigned node){	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;	double *addr_real = NULL;	double *addr_imaginary = NULL;	starpu_ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);	addr_real = (double*) starpu_malloc_on_node(node, requested_memory);	if (!addr_real)		goto fail_real;	addr_imaginary = (double*) starpu_malloc_on_node(node, requested_memory);	if (!addr_imaginary)		goto fail_imaginary;	/* update the data properly in consequence */	complex_interface->real = addr_real;	complex_interface->imaginary = addr_imaginary;	return 2*requested_memory;fail_imaginary:	starpu_free_on_node(node, (uintptr_t) addr_real, requested_memory);fail_real:	return -ENOMEM;}\endcodeHere we try to allocate the two parts. If either of them fails, we return\c -ENOMEM. If they succeed, we can record the obtained pointers and returned theamount of allocated memory (for memory usage accounting).Conversely, \c complex_free_data_on_node() frees the two parts:\code{.c}static void complex_free_data_on_node(void *data_interface, unsigned node){	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;	starpu_ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);	starpu_free_on_node(node, (uintptr_t) complex_interface->real, requested_memory);	starpu_free_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);}\endcodeWe we have not made anything particular for GPUs or whatsoever: it isstarpu_free_on_node() which knows how to actually make the allocation, andreturns the resulting pointer, be it in main memory, in GPU memory, etc.\subsection DefiningANewDataInterface_copy Data copyNow that StarPU knows how to allocate/free a buffer, it needs to be able tocopy over data into/from it. Defining a method \c copy_any_to_any() allows StarPU toperform direct transfers between main memory and GPU memory.\code{.c}static int copy_any_to_any(void *src_interface, unsigned src_node,			   void *dst_interface, unsigned dst_node,			   void *async_data){	struct starpu_complex_interface *src_complex = src_interface;	struct starpu_complex_interface *dst_complex = dst_interface;	int ret = 0;	if (starpu_interface_copy((uintptr_t) src_complex->real, 0, src_node,				    (uintptr_t) dst_complex->real, 0, dst_node,				     src_complex->nx*sizeof(src_complex->real[0]),				     async_data))		ret = -EAGAIN;	if (starpu_interface_copy((uintptr_t) src_complex->imaginary, 0, src_node,				    (uintptr_t) dst_complex->imaginary, 0, dst_node,				     src_complex->nx*sizeof(src_complex->imaginary[0]),				     async_data))		ret = -EAGAIN;	return ret;}\endcodeWe here again have no idea what is main memory or GPU memory, or even if thecopy is synchronous or asynchronous: we just call starpu_interface_copy()according to the interface, passing it the pointers, and checking whether itreturned \c -EAGAIN, which means the copy is asynchronous, and StarPU willappropriately wait for it thanks to the pointer \c async_data.This copy method is referenced in a structure \ref starpu_data_copy_methods\code{.c}static const struct starpu_data_copy_methods complex_copy_methods ={	.any_to_any = copy_any_to_any};\endcodewhich was referenced in the structure \ref starpu_data_interface_ops above.Other fields of \ref starpu_data_copy_methods allow to provide optimizedvariants, notably for the case of 2D or 3D matrix tiles with non-trivial ld.\subsection DefiningANewDataInterface_pack Data pack/peek/unpackThe copy methods allow for RAM/GPU transfers, but is not enough for e.g.transferring over MPI. That requires defining the pack/peek/unpack methods. Theprinciple is that the method starpu_data_interface_ops::pack_data concatenatesthe buffer data into a newly-allocated contiguous bytes array, converselystarpu_data_interface_ops::peek_data extracts from a bytes array into thebuffer data, and starpu_data_interface_ops::unpack_data does the same asstarpu_data_interface_ops::peek_data but also frees the bytes array.\code{.c}static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count){	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)		starpu_data_get_interface_on_node(handle, node);	*count = complex_get_size(handle);	if (ptr != NULL)	{		char *data;		data = (void*) starpu_malloc_on_node_flags(node, *count, 0);		*ptr = data;		memcpy(data, complex_interface->real, complex_interface->nx*sizeof(double));		memcpy(data+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));	}	return 0;}\endcode\c complex_pack_data() first computes the size to be allocated, then allocates it,and copies over into it the content of the two real and imaginary arrays.\code{.c}static int complex_peek_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count){	char *data = ptr;	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)		starpu_data_get_interface_on_node(handle, node);	STARPU_ASSERT(count == 2 * complex_interface->nx * sizeof(double));	memcpy(complex_interface->real, data, complex_interface->nx*sizeof(double));	memcpy(complex_interface->imaginary, data+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));	return 0;}\endcode\c complex_peek_data() simply uses \c memcpy() to copy over from the bytes array into the data buffer.\code{.c}static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count){	complex_peek_data(handle, node, ptr, count);	starpu_free_on_node_flags(node, (uintptr_t) ptr, count, 0);	return 0;}\endcodeAnd \c complex_unpack_data() just calls \c complex_peek_data() and releases the bytes array.\section SpecifyingATargetNode Specifying A Target Node For Task DataWhen executing a task on a GPU for instance, StarPU would normally copy all theneeded data for the tasks on the embedded memory of the GPU.  It may howeverhappen that the task kernel would rather have some of the datas kept in themain memory instead of copied in the GPU, a pivoting vector for instance.This can be achieved by setting the flag starpu_codelet::specific_nodes to<c>1</c>, and then fill the array starpu_codelet::nodes (or starpu_codelet::dyn_nodes whenstarpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numberswhere data should be copied to, or ::STARPU_SPECIFIC_NODE_LOCAL to letStarPU copy it to the memory node where the task will be executed.::STARPU_SPECIFIC_NODE_CPU can also be used to request data to beput in CPU-accessible memory (and let StarPU choose the NUMA node).::STARPU_SPECIFIC_NODE_FAST and ::STARPU_SPECIFIC_NODE_SLOW can also beusedFor instance,with the following codelet:\code{.c}struct starpu_codelet cl ={	.cuda_funcs = { kernel },	.nbuffers = 2,	.modes = {STARPU_RW, STARPU_RW},	.specific_nodes = 1,	.nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},};\endcodethe first data of the task will be kept in the CPU memory, while the seconddata will be copied to the CUDA GPU as usual. A working example is available in<c>tests/datawizard/specific_node.c</c>With the following codelet:\code{.c}struct starpu_codelet cl ={	.cuda_funcs = { kernel },	.nbuffers = 2,	.modes = {STARPU_RW, STARPU_RW},	.specific_nodes = 1,	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_SLOW},};\endcodeThe first data will be copied into fast (but probably size-limited) local memorywhile the second data will be left in slow (but large) memory. This makes sensewhen the kernel does not make so many accesses to the second data, and thus databeing remote e.g. over a PCI bus is not a performance problem, and avoidsfilling the fast local memory with data which does not need the performance.In cases where the kernel is fine with some data being either local or in themain memory, ::STARPU_SPECIFIC_NODE_LOCAL_OR_CPU can be used. StarPU will thenbe free to leave the data in the main memory and let the kernel access it fromaccelerators, or to move it to the accelerator before starting the kernel, forinstance:\code{.c}struct starpu_codelet cl ={	.cuda_funcs = { kernel },	.nbuffers = 2,	.modes = {STARPU_RW, STARPU_R},	.specific_nodes = 1,	.nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},};\endcode*/
 |