\input texinfo @c -*-texinfo-*-
@c %**start of header
@setfilename starpu.info
@settitle StarPU
@c %**end of header
@setchapternewpage odd
@titlepage
@title StarPU
@page
@vskip 0pt plus 1filll
@comment For the @value{version-GCC} Version*
@end titlepage
@summarycontents
@contents
@page
@node Top
@top Preface
@cindex Preface
This manual documents the usage of StarPU.
@comment
@comment  When you add a new menu item, please keep the right hand
@comment  aligned to the same column.  Do not use tabs.
@comment  This provides better formatting.
@comment
@menu
* Introduction::          A basic introduction to using StarPU
* Installing StarPU::     How to configure, build and install StarPU
* Configuration options:: Configuration options
* Environment variables:: Environment variables used by StarPU
* StarPU API::            The API to use StarPU
* Basic Examples::        Basic examples of the use of StarPU
* Advanced Topics::       Advanced use of StarPU
@end menu

@c ---------------------------------------------------------------------
@c Introduction to StarPU
@c ---------------------------------------------------------------------

@node Introduction
@chapter Introduction to StarPU

@menu
* Motivation::             Why StarPU?
* StarPU in a Nutshell::   The Fundamentals of StarPU
@end menu

@node Motivation
@section Motivation

@c complex machines with heterogeneous cores/devices
The use of specialized hardware such as accelerators or coprocessors offers an
interesting approach to overcome the physical limits encountered by processor
architects. As a result, many machines are now equipped with one or several
accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot
of effort has been devoted to offloading computation onto such accelerators,
very little attention has been paid to portability concerns on the one hand,
and to the possibility of having heterogeneous accelerators and processors
interact on the other hand.

StarPU is a runtime system that offers support for heterogeneous multicore
architectures. It not only offers a unified view of the computational resources
(i.e. CPUs and accelerators at the same time), but also takes care of
efficiently mapping and executing tasks onto a heterogeneous machine while
transparently handling low-level issues in a portable fashion.

@c this leads to a complicated distributed memory design
@c which is not (easily) manageable by hand
@c added value/benefits of StarPU
@c   - portability
@c   - scheduling, perf. portability

@node StarPU in a Nutshell
@section StarPU in a Nutshell

From a programming point of view, StarPU is not a new language but a library
that executes tasks explicitly submitted by the application.  The data that a
task manipulates are automatically transferred onto the accelerator so that the
programmer does not have to take care of complex data movements.  StarPU also
takes particular care of scheduling those tasks efficiently and allows
scheduling experts to implement custom scheduling policies in a portable
fashion.

@c explain the notion of codelet and task (i.e. g(A, B)
@subsection Codelet and Tasks

One of StarPU's primary data structures is the @b{codelet}. A codelet describes
a computational kernel that can possibly be implemented on multiple
architectures such as a CPU, a CUDA device or a Cell's SPU.

@c TODO insert illustration f : f_spu, f_cpu, ...

Another important data structure is the @b{task}. Executing a StarPU task
consists in applying a codelet on a data set, on one of the architectures on
which the codelet is implemented. In addition to the codelet that it
implements, a task also describes which data are accessed, and how they are
accessed during the computation (read and/or write).

StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
operation. The task structure can also specify a @b{callback} function that is
called once StarPU has properly executed the task.
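As a schematic illustration of these two structures, here is a hedged sketch of
a codelet with a single CPU implementation and of a task that instantiates it.
The kernel and callback names are placeholders; a complete, commented program
is developed in @ref{Basic Examples}.

@cartouche
@example
void my_kernel_cpu(void *buffers[], void *cl_arg)
@{
    /* placeholder kernel body */
@}

void my_callback(void *arg)
@{
    /* executed on the host once the task has completed */
@}

starpu_codelet my_cl =
@{
    .where = STARPU_CPU,        /* architectures that may run the kernel */
    .cpu_func = my_kernel_cpu,  /* CPU implementation */
    .nbuffers = 0               /* number of data buffers accessed */
@};

int submit_one_task(void)
@{
    struct starpu_task *task = starpu_task_create();
    task->cl = &my_cl;                  /* which codelet to execute */
    task->callback_func = my_callback;  /* optional completion callback */
    return starpu_task_submit(task);    /* non-blocking submission */
@}
@end example
@end cartouche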
It also contains optionalfields that the application may use to give hints to the scheduler (such aspriority levels).A task may be identified by a unique 64-bit number which we refer as a @b{tag}.Task dependencies can be enforced either by the means of callback functions, orby expressing dependencies between tags.@c TODO insert illustration f(Ar, Brw, Cr) + ..@c DSM@subsection StarPU Data Management LibraryBecause StarPU schedules tasks at runtime, data transfers have to bedone automatically and ``just-in-time'' between processing units,relieving the application programmer from explicit data transfers.Moreover, to avoid unnecessary transfers, StarPU keeps datawhere it was last needed, even if was modified there, and itallows multiple copies of the same data to reside at the same time onseveral processing units as long as it is not modified.@c ---------------------------------------------------------------------@c Installing StarPU@c ---------------------------------------------------------------------@node Installing StarPU@chapter Installing StarPU@menu* Configuration of StarPU::* Building and Installing StarPU::@end menuStarPU can be built and installed by the standard means of the GNUautotools. The following chapter is intended to briefly remind how these toolscan be used to install StarPU.@node Configuration of StarPU@section Configuration of StarPU@menu* Generating Makefiles and configuration scripts::* Configuring StarPU::@end menu@node Generating Makefiles and configuration scripts@subsection Generating Makefiles and configuration scriptsThis step is not necessary when using the tarball releases of StarPU.  If youare using the source code from the svn repository, you first need to generatethe configure scripts and the Makefiles.@example$ autoreconf -vfi@end example@node Configuring StarPU@subsection Configuring StarPU@example$ ./configure@end exampleDetails about options that are useful to give to @code{./configure} are given in@ref{Configuration options}.@node Building and Installing StarPU@section Building and Installing StarPU@menu* Building::* Sanity Checks::* Installing::* pkg-config configuration::@end menu@node Building@subsection Building@example$ make@end example@node Sanity Checks@subsection Sanity ChecksIn order to make sure that StarPU is working properly on the system, it is alsopossible to run a test suite.@example$ make check@end example@node Installing@subsection InstallingIn order to install StarPU at the location that was specified duringconfiguration:@example$ make install@end example@node pkg-config configuration@subsection pkg-config configurationIt is possible that compiling and linking an application against StarPUrequires to use specific flags or libraries (for instance @code{CUDA} or@code{libspe2}). To this end, it is possible to use the @code{pkg-config} tool.If StarPU was not installed at some standard location, the path of StarPU'slibrary must be specified in the @code{PKG_CONFIG_PATH} environment variable sothat @code{pkg-config} can find it. 
For example if StarPU was installed in
@code{$prefix_dir}:

@example
$ export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
@end example

The flags required to compile or link against StarPU are then
accessible with the following commands:

@example
$ pkg-config --cflags libstarpu  # options for the compiler
$ pkg-config --libs libstarpu    # options for the linker
@end example

@c ---------------------------------------------------------------------
@c Configuration options
@c ---------------------------------------------------------------------

@node Configuration options
@chapter Configuration options

@table @asis

@item @code{--disable-cpu}
Disable the use of the CPUs of the machine. Only GPUs etc. will be used.

@item @code{--enable-maxcudadev=<number>}
Defines the maximum number of CUDA devices that StarPU will support, then
available as the STARPU_MAXCUDADEVS macro.

@item @code{--disable-cuda}
Disable the use of CUDA, even if the SDK is detected.

@item @code{--enable-maxopencldev=<number>}
Defines the maximum number of OpenCL devices that StarPU will support, then
available as the STARPU_MAXOPENCLDEVS macro.

@item @code{--disable-opencl}
Disable the use of OpenCL, even if the SDK is detected.

@item @code{--enable-gordon}
Enable the use of the Gordon runtime for Cell SPUs.
@c TODO: rather default to enabled when detected

@item @code{--enable-debug}
Enable debugging messages.

@item @code{--enable-fast}
Do not enforce assertions; this saves the time otherwise spent computing them.

@item @code{--enable-verbose}
Increase the verbosity of the debugging messages.

@item @code{--enable-coverage}
Enable flags for the coverage tool.

@item @code{--enable-perf-debug}
Enable performance debugging.

@item @code{--enable-model-debug}
Enable performance model debugging.

@item @code{--enable-stats}
Enable statistics.

@item @code{--enable-maxbuffers=<nbuffers>}
Define the maximum number of buffers that tasks will be able to take as
parameters, then available as the STARPU_NMAXBUFS macro.

@item @code{--disable-priority}
Disable taking priorities into account in scheduling decisions. Mostly for
comparison purposes.

@item @code{--enable-allocation-cache}
Enable the use of a data allocation cache to avoid the cost of repeated
allocations with CUDA. Still experimental.

@item @code{--enable-opengl-render}
Enable the use of OpenGL for the rendering of some examples.
@c TODO: rather default to enabled when detected

@item @code{--enable-blas-lib=<name>}
Specify the BLAS library to be used by some of the examples. The
library has to be 'atlas' or 'goto'.

@item @code{--with-cuda-dir=<path>}
Specify the location where the CUDA SDK resides. This directory should notably contain
@code{include/cuda.h}.

@item @code{--with-magma=<path>}
Specify where MAGMA is installed.

@item @code{--with-opencl-dir=<path>}
Specify the location of the OpenCL SDK. This directory should notably contain
@code{include/CL/cl.h}.

@item @code{--with-gordon-dir=<path>}
Specify the location of the Gordon SDK.

@item @code{--with-fxt=<path>}
Specify the location of FxT (for generating traces and rendering them
using ViTE). 
This directory should notably contain@code{include/fxt/fxt.h}.@item @code{--with-perf-model-dir=<dir>}Specify where performance models should be stored (instead of defaulting to thecurrent user's home).@item @code{--with-mpicc=<path to mpicc>}Specify the location of the @code{mpicc} compiler to be used for starpumpi.@c TODO: also just use AC_PROG@item @code{--with-mpi}Enable building libstarpumpi.@c TODO: rather just use the availability of mpicc instead of a second option@item @code{--with-goto-dir=<dir>}Specify the location of GotoBLAS.@item @code{--with-atlas-dir=<dir>}Specify the location of ATLAS. This directory should notably contain@code{include/cblas.h}.@end table@c ---------------------------------------------------------------------@c Environment variables@c ---------------------------------------------------------------------@node Environment variables@chapter Environment variables@menu* Workers::     Configuring workers* Scheduling::  Configuring the Scheduling engine* Misc::        Miscellaneous and debug@end menuNote: the values given in @code{starpu_conf} structure passed whencalling @code{starpu_init} will override the values of the environmentvariables.@node Workers@section Configuring workers@menu* STARPU_NCPUS     :: Number of CPU workers* STARPU_NCUDA     :: Number of CUDA workers* STARPU_NOPENCL   :: Number of OpenCL workers* STARPU_NGORDON   :: Number of SPU workers (Cell)* STARPU_WORKERS_CPUID    :: Bind workers to specific CPUs* STARPU_WORKERS_CUDAID   :: Select specific CUDA devices* STARPU_WORKERS_OPENCLID :: Select specific OpenCL devices@end menu@node STARPU_NCPUS@subsection @code{STARPU_NCPUS} -- Number of CPU workers@table @asis@item @emph{Description}:Specify the maximum number of CPU workers. Note that StarPU will not allocatemore CPUs than there are physical CPUs, and that some CPUs are used to controlthe accelerators.@end table@node STARPU_NCUDA@subsection @code{STARPU_NCUDA} -- Number of CUDA workers@table @asis@item @emph{Description}:Specify the maximum number of CUDA devices that StarPU can use. If@code{STARPU_NCUDA} is lower than the number of physical devices, it ispossible to select which CUDA devices should be used by the means of the@code{STARPU_WORKERS_CUDAID} environment variable.@end table@node STARPU_NOPENCL@subsection @code{STARPU_NOPENCL} -- Number of OpenCL workers@table @asis@item @emph{Description}:OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.@end table@node STARPU_NGORDON@subsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)@table @asis@item @emph{Description}:Specify the maximum number of SPUs that StarPU can use.@end table@node STARPU_WORKERS_CPUID@subsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs@table @asis@item @emph{Description}:Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}specifies on which logical CPU the different workers should bebound. For instance, if @code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the firstworker will be bound to logical CPU #1, the second CPU worker will be bound tological CPU #3 and so on.  Note that the logical ordering of the CPUs is eitherdetermined by the OS, or provided by the @code{hwloc} library in case it isavailable.Note that the first workers correspond to the CUDA workers, then come theOpenCL and the SPU, and finally the CPU workers. 
For example ifwe have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlledby logical CPU #0, the OpenCL device will be controlled by logical CPU #2, andthe logical CPUs #1 and #3 will be used by the CPU workers.If the number of workers is larger than the array given in@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in around-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and thethird (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).@end table@node STARPU_WORKERS_CUDAID@subsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices@table @asis@item @emph{Description}:Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it ispossible to select which CUDA devices should be used by StarPU. On a machineequipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and thatthey should use CUDA devices #1 and #3 (the logical ordering of the devices isthe one reported by CUDA).@end table@node STARPU_WORKERS_OPENCLID@subsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices@table @asis@item @emph{Description}:OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.@end table@node Scheduling@section Configuring the Scheduling engine@menu* STARPU_SCHED     :: Scheduling policy* STARPU_CALIBRATE :: Calibrate performance models* STARPU_PREFETCH  :: Use data prefetch* STARPU_SCHED_ALPHA  :: Computation factor* STARPU_SCHED_BETA  :: Communication factor@end menu@node STARPU_SCHED@subsection @code{STARPU_SCHED} -- Scheduling policy@table @asis@item @emph{Description}:This chooses between the different scheduling policies proposed by StarPU: workrandom, stealing, greedy, with performance models, etc.Use @code{STARPU_SCHED=help} to get the list of available schedulers.@end table@node STARPU_CALIBRATE@subsection @code{STARPU_CALIBRATE} -- Calibrate performance models@table @asis@item @emph{Description}:If this variable is set to 1, the performance models are calibrated duringthe execution. If it is set to 2, the previous values are dropped to restartcalibration from scratch.Note: this currently only applies to dm and dmda scheduling policies.@end table@node STARPU_PREFETCH@subsection @code{STARPU_PREFETCH} -- Use data prefetch@table @asis@item @emph{Description}:If this variable is set, data prefetching will be enabled, that is when a task isscheduled to be executed e.g. on a GPU, StarPU will request an asynchronoustransfer in advance, so that data is already present on the GPU when the taskstarts. As a result, computation and data transfers are overlapped.@end table@node STARPU_SCHED_ALPHA@subsection @code{STARPU_SCHED_ALPHA} -- Computation factor@table @asis@item @emph{Description}:To estimate the cost of a task StarPU takes into account the estimatedcomputation time (obtained thanks to performance models). The alpha factor isthe coefficient to be applied to it before adding it to the communication part.@end table@node STARPU_SCHED_BETA@subsection @code{STARPU_SCHED_BETA} -- Communication factor@table @asis@item @emph{Description}:To estimate the cost of a task StarPU takes into account the estimateddata transfer time (obtained thanks to performance models). 
The beta factor isthe coefficient to be applied to it before adding it to the computation part.@end table@node Misc@section Miscellaneous and debug@menu* STARPU_LOGFILENAME  :: Select debug file name@end menu@node STARPU_LOGFILENAME@subsection @code{STARPU_LOGFILENAME} -- Select debug file name@table @asis@item @emph{Description}:This variable specify in which file the debugging output should be saved to.@end table@c ---------------------------------------------------------------------@c StarPU API@c ---------------------------------------------------------------------@node StarPU API@chapter StarPU API@menu* Initialization and Termination::   Initialization and Termination methods* Workers' Properties::              Methods to enumerate workers' properties* Data Library::                     Methods to manipulate data* Codelets and Tasks::               Methods to construct tasks* Tags::                             Task dependencies* CUDA extensions::                  CUDA extensions* OpenCL extensions::                OpenCL extensions* Cell extensions::                  Cell extensions* Miscellaneous::                    Miscellaneous helpers@end menu@node Initialization and Termination@section Initialization and Termination@menu* starpu_init::            Initialize StarPU* struct starpu_conf::     StarPU runtime configuration* starpu_shutdown::        Terminate StarPU@end menu@node starpu_init@subsection @code{starpu_init} -- Initialize StarPU@table @asis@item @emph{Description}:This is StarPU initialization method, which must be called prior to any otherStarPU call.  It is possible to specify StarPU's configuration (e.g. schedulingpolicy, number of cores, ...) by passing a non-null argument. Defaultconfiguration is used if the passed argument is @code{NULL}.@item @emph{Return value}:Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}indicates that no worker was available (so that StarPU was not initialized).@item @emph{Prototype}:@code{int starpu_init(struct starpu_conf *conf);}@end table@node struct starpu_conf@subsection @code{struct starpu_conf} -- StarPU runtime configuration@table @asis@item @emph{Description}:This structure is passed to the @code{starpu_init} function in orderto configure StarPU.When the default value is used, StarPU automatically selects the numberof processing units and takes the default scheduling policy. This parameteroverwrites the equivalent environment variables.@item @emph{Fields}:@table @asis @item @code{sched_policy} (default = NULL):This is the name of the scheduling policy. This can also be specified with the@code{STARPU_SCHED} environment variable.@item @code{ncpus} (default = -1):This is the maximum number of CPU cores that StarPU can use. This can also bespecified with the @code{STARPU_NCPUS} environment variable.@item @code{ncuda} (default = -1):This is the maximum number of CUDA devices that StarPU can use. This can also bespecified with the @code{STARPU_NCUDA} environment variable.@item @code{nopencl} (default = -1):This is the maximum number of OpenCL devices that StarPU can use. This can also bespecified with the @code{STARPU_NOPENCL} environment variable.@item @code{nspus} (default = -1):This is the maximum number of Cell SPUs that StarPU can use. This can also bespecified with the @code{STARPU_NGORDON} environment variable.@item @code{calibrate} (default = 0):If this flag is set, StarPU will calibrate the performance models whenexecuting tasks. 
This can also be specified with the @code{STARPU_CALIBRATE}
environment variable.
@end table
@end table

@node starpu_shutdown
@subsection @code{starpu_shutdown} -- Terminate StarPU
@table @asis
@item @emph{Description}:
This is the StarPU termination method. It must be called at the end of the
application: statistics and other post-mortem debugging information are not
guaranteed to be available until this method has been called.
@item @emph{Prototype}:
@code{void starpu_shutdown(void);}
@end table

@node Workers' Properties
@section Workers' Properties

@menu
* starpu_worker_get_count::        Get the number of processing units
* starpu_cpu_worker_get_count::    Get the number of CPUs controlled by StarPU
* starpu_cuda_worker_get_count::   Get the number of CUDA devices controlled by StarPU
* starpu_opencl_worker_get_count:: Get the number of OpenCL devices controlled by StarPU
* starpu_spu_worker_get_count::    Get the number of Cell SPUs controlled by StarPU
* starpu_worker_get_id::           Get the identifier of the current worker
* starpu_worker_get_type::         Get the type of processing unit associated to a worker
* starpu_worker_get_name::         Get the name of a worker
@end menu

@node starpu_worker_get_count
@subsection @code{starpu_worker_get_count} -- Get the number of processing units
@table @asis
@item @emph{Description}:
This function returns the number of workers (i.e. processing units executing
StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
@item @emph{Prototype}:
@code{unsigned starpu_worker_get_count(void);}
@end table

@node starpu_cpu_worker_get_count
@subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CPUs controlled by StarPU. The returned
value should be at most @code{STARPU_NMAXCPUS}.
@item @emph{Prototype}:
@code{unsigned starpu_cpu_worker_get_count(void);}
@end table

@node starpu_cuda_worker_get_count
@subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CUDA devices controlled by StarPU. The returned
value should be at most @code{STARPU_MAXCUDADEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_cuda_worker_get_count(void);}
@end table

@node starpu_opencl_worker_get_count
@subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of OpenCL devices controlled by StarPU. The returned
value should be at most @code{STARPU_MAXOPENCLDEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_opencl_worker_get_count(void);}
@end table

@node starpu_spu_worker_get_count
@subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of Cell SPUs controlled by StarPU.
@item @emph{Prototype}:
@code{unsigned starpu_spu_worker_get_count(void);}
@end table

@node starpu_worker_get_id
@subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker
@table @asis
@item @emph{Description}:
This function returns the identifier of the worker associated to the calling
thread. The returned value is either -1 if the current context is not a StarPU
worker (i.e. 
when called from the application outside a task or a callback), or
an integer between 0 and @code{starpu_worker_get_count() - 1}.
@item @emph{Prototype}:
@code{int starpu_worker_get_id(void);}
@end table

@node starpu_worker_get_type
@subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker
@table @asis
@item @emph{Description}:
This function returns the type of worker associated to an identifier (as
returned by the @code{starpu_worker_get_id} function). The returned value
indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
core, @code{STARPU_CUDA_WORKER} for a CUDA device,
@code{STARPU_OPENCL_WORKER} for an OpenCL device, and
@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
identifier is unspecified.
@item @emph{Prototype}:
@code{enum starpu_archtype starpu_worker_get_type(int id);}
@end table

@node starpu_worker_get_name
@subsection @code{starpu_worker_get_name} -- Get the name of a worker
@table @asis
@item @emph{Description}:
StarPU associates a unique human-readable string to each processing unit. This
function copies at most the first @code{maxlen} bytes of the unique string
associated to the worker identified by @code{id} into the
@code{dst} buffer. The caller is responsible for ensuring that @code{dst}
is a valid pointer to a buffer of at least @code{maxlen} bytes. Calling this
function on an invalid identifier results in unspecified behaviour.
@item @emph{Prototype}:
@code{void starpu_worker_get_name(int id, char *dst, size_t maxlen);}
@end table

@node Data Library
@section Data Library

This section describes the data management facilities provided by StarPU.

TODO: We show how to use existing data interfaces in [ref], but developers can
design their own data interfaces if required.

@menu
* starpu_data_handle::  StarPU opaque data handle
* void *interface::     StarPU data interface
@end menu

@node starpu_data_handle
@subsection @code{starpu_data_handle} -- StarPU opaque data handle
@table @asis
@item @emph{Description}:
StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
data. 
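For instance, an array can be registered through the @b{vector interface}
(detailed in @ref{Scaling a Vector}) to obtain such a handle; this is only a
sketch, and @code{NX} is an arbitrary size chosen for the illustration:

@cartouche
@example
#define NX 1024

float vector[NX];
starpu_data_handle vector_handle;

/* memory node 0 (main memory) initially holds the data */
starpu_vector_data_register(&vector_handle, 0, vector, NX, sizeof(float));

/* tasks can now refer to the data through vector_handle */
@end example
@end cartouche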
Once a piece of data has been registered to StarPU, it is associated to a@code{starpu_data_handle} which keeps track of the state of the piece of dataover the entire machine, so that we can maintain data consistency and locatedata replicates for instance.@end table@node void *interface@subsection @code{void *interface} -- StarPU data interface@table @asis @item @emph{Description}:Data management is done at a high-level in StarPU: rather than accessing a merelist of contiguous buffers, the tasks may manipulate data that are described bya high-level construct which we call data interface.TODO@end table@c void starpu_data_unregister(struct starpu_data_state_t *state);@c starpu_worker_get_memory_node TODO@c @c user interaction with the DSM@c   void starpu_data_sync_with_mem(struct starpu_data_state_t *state);@c   void starpu_notify_data_modification(struct starpu_data_state_t *state, uint32_t modifying_node);@node Codelets and Tasks@section Codelets and Tasks@menu* struct starpu_codelet::         StarPU codelet structure* struct starpu_task::            StarPU task structure* starpu_task_init::              Initialize a Task* starpu_task_create::            Allocate and Initialize a Task* starpu_task_deinit::            Release all the resources used by a Task* starpu_task_destroy::           Destroy a dynamically allocated Task* starpu_task_submit::            Submit a Task* starpu_task_wait::              Wait for the termination of a Task* starpu_task_wait_for_all::	  Wait for the termination of all Tasks@end menu@node struct starpu_codelet@subsection @code{struct starpu_codelet} -- StarPU codelet structure@table @asis @item @emph{Description}:The codelet structure describes a kernel that is possibly implemented onvarious targets.@item @emph{Fields}:@table @asis@item @code{where}: Indicates which types of processing units are able to execute the codelet.@code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet isimplemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}indicates that it is only available on Cell SPUs.@item @code{cpu_func} (optional):Is a function pointer to the CPU implementation of the codelet. Its prototypemust be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The firstargument being the array of data managed by the data management library, andthe second argument is a pointer to the argument passed from the @code{cl_arg}field of the @code{starpu_task} structure.The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear inthe @code{where} field, it must be non-null otherwise.@item @code{cuda_func} (optional):Is a function pointer to the CUDA implementation of the codelet. @emph{Thismust be a host-function written in the CUDA runtime API}. Its prototype mustbe: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}field, it must be non-null otherwise.@item @code{opencl_func} (optional):Is a function pointer to the OpenCL implementation of the codelet. Itsprototype must be:@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.This pointer is ignored if @code{OPENCL} does not appear in the@code{where} field, it must be non-null otherwise.@item @code{gordon_func} (optional):This is the index of the Cell SPU implementation within the Gordon library.TODO@item @code{nbuffers}:Specifies the number of arguments taken by the codelet. These arguments aremanaged by the DSM and are accessed from the @code{void *buffers[]}array. 
The constant argument passed with the @code{cl_arg} field of the@code{starpu_task} structure is not counted in this number.  This value shouldnot be above @code{STARPU_NMAXBUFS}.@item @code{model} (optional):This is a pointer to the performance model associated to this codelet. Thisoptional field is ignored when null. TODO@end table@end table@node struct starpu_task@subsection @code{struct starpu_task} -- StarPU task structure@table @asis@item @emph{Description}:The @code{starpu_task} structure describes a task that can be offloaded on the variousprocessing units managed by StarPU. It instantiates a codelet. It can either beallocated dynamically with the @code{starpu_task_create} method, or declaredstatically. In the latter case, the programmer has to zero the@code{starpu_task} structure and to fill the different fields properly. Theindicated default values correspond to the configuration of a task allocatedwith @code{starpu_task_create}.@item @emph{Fields}:@table @asis@item @code{cl}:Is a pointer to the corresponding @code{starpu_codelet} data structure. Thisdescribes where the kernel should be executed, and supplies the appropriateimplementations. When set to @code{NULL}, no code is executed during the tasks,such empty tasks can be useful for synchronization purposes. @item @code{buffers}:TODO@item @code{cl_arg} (optional) (default = NULL):This pointer is passed to the codelet through the second argumentof the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).In the specific case of the Cell processor, see the @code{cl_arg_size}argument.@item @code{cl_arg_size} (optional, Cell specific):In the case of the Cell processor, the @code{cl_arg} pointer is not directlygiven to the SPU function. A buffer of size @code{cl_arg_size} is allocated onthe SPU. This buffer is then filled with the @code{cl_arg_size} bytes startingat address @code{cl_arg}. In this case, the argument given to the SPU codeletis therefore not the @code{cl_arg} pointer, but the address of the buffer inlocal store (LS) instead. This field is ignored for CPU, CUDA and OpenCLcodelets. @item @code{callback_func} (optional) (default = @code{NULL}):This is a function pointer of prototype @code{void (*f)(void *)} whichspecifies a possible callback. If this pointer is non-null, the callbackfunction is executed @emph{on the host} after the execution of the task. Thecallback is passed the value contained in the @code{callback_arg} field. Nocallback is executed if the field is null.@item @code{callback_arg} (optional) (default = @code{NULL}):This is the pointer passed to the callback function. This field is ignored ifthe @code{callback_func} is null.@item @code{use_tag} (optional) (default = 0):If set, this flag indicates that the task should be associated with the tagcontained in the @code{tag_id} field. Tag allow the application to synchronizewith the task and to express task dependencies easily.@item @code{tag_id}:This fields contains the tag associated to the task if the @code{use_tag} fieldwas set, it is ignored otherwise.@item @code{synchronous}:If this flag is set, the @code{starpu_task_submit} function is blocking andreturns only when the task has been executed (or if no worker is able toprocess the task). Otherwise, @code{starpu_task_submit} returns immediately.@item @code{priority} (optional) (default = @code{STARPU_DEFAULT_PRIO}):This field indicates a level of priority for the task. 
This is an integer valuethat must be set between @code{STARPU_MIN_PRIO} (for the least importanttasks) and @code{STARPU_MAX_PRIO} (for the most important tasks) included.Default priority is @code{STARPU_DEFAULT_PRIO}.  Scheduling strategies thattake priorities into account can use this parameter to take better schedulingdecisions, but the scheduling policy may also ignore it.@item @code{execute_on_a_specific_worker} (default = 0):If this flag is set, StarPU will bypass the scheduler and directly affect thistask to the worker specified by the @code{workerid} field.@item @code{workerid} (optional):If the @code{execute_on_a_specific_worker} field is set, this field indicateswhich is the identifier of the worker that should process this task (asreturned by @code{starpu_worker_get_id}). This field is ignored if@code{execute_on_a_specific_worker} field is set to 0.@item @code{detach} (optional) (default = 1):If this flag is set, it is not possible to synchronize with the taskby the means of @code{starpu_task_wait} later on. Internal data structuresare only guaranteed to be freed once @code{starpu_task_wait} is called if theflag is not set.@item @code{destroy} (optional) (default = 1):If this flag is set, the task structure will automatically be freed, eitherafter the execution of the callback if the task is detached, or during@code{starpu_task_wait} otherwise. If this flag is not set, dynamicallyallocated data structures will not be freed until @code{starpu_task_destroy} iscalled explicitly. Setting this flag for a statically allocated task structurewill result in undefined behaviour.@end table@end table@node starpu_task_init@subsection @code{starpu_task_init} -- Initialize a Task@table @asis@item @emph{Description}:Initialize a task structure with default values. This function is implicitlycalled by @code{starpu_task_create}. By default, tasks initialized with@code{starpu_task_init} must be deinitialized explicitly with@code{starpu_task_deinit}. Tasks can also be initialized statically, using theconstant @code{STARPU_TASK_INITIALIZER}.@item @emph{Prototype}:@code{void starpu_task_init(struct starpu_task *task);}@end table@node starpu_task_create@subsection @code{starpu_task_create} -- Allocate and Initialize a Task@table @asis@item @emph{Description}:Allocate a task structure and initialize it with default values. Tasksallocated dynamically with @code{starpu_task_create} are automatically freed when thetask is terminated. If the destroy flag is explicitly unset, the resources usedby the task are freed by calling@code{starpu_task_destroy}.@item @emph{Prototype}:@code{struct starpu_task *starpu_task_create(void);}@end table@node starpu_task_deinit@subsection @code{starpu_task_deinit} -- Release all the resources used by a Task@table @asis@item @emph{Description}:Release all the structures automatically allocated to execute the task. This iscalled automatically by @code{starpu_task_destroy}, but the task structure itself is notfreed. This should be used for statically allocated tasks for instance.@item @emph{Prototype}:@code{void starpu_task_deinit(struct starpu_task *task);}@end table@node starpu_task_destroy@subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task@table @asis@item @emph{Description}:Free the resource allocated during @code{starpu_task_create}. 
This function can becalled automatically after the execution of a task by setting the@code{destroy} flag of the @code{starpu_task} structure (default behaviour).Calling this function on a statically allocated task results in an undefinedbehaviour.@item @emph{Prototype}:@code{void starpu_task_destroy(struct starpu_task *task);}@end table@node starpu_task_wait@subsection @code{starpu_task_wait} -- Wait for the termination of a Task@table @asis@item @emph{Description}:This function blocks until the task has been executed. It is not possible tosynchronize with a task more than once. It is not possible to wait forsynchronous or detached tasks.@item @emph{Return value}:Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}indicates that the specified task was either synchronous or detached.@item @emph{Prototype}:@code{int starpu_task_wait(struct starpu_task *task);}@end table@node starpu_task_submit@subsection @code{starpu_task_submit} -- Submit a Task@table @asis@item @emph{Description}:This function submits a task to StarPU. Calling this function doesnot mean that the task will be executed immediately as there can be data or task(tag) dependencies that are not fulfilled yet: StarPU will take care ofscheduling this task with respect to such dependencies.This function returns immediately if the @code{synchronous} field of the@code{starpu_task} structure was set to 0, and block until the termination ofthe task otherwise. It is also possible to synchronize the application withasynchronous tasks by the means of tags, using the @code{starpu_tag_wait}function for instance.@item @emph{Return value}:In case of success, this function returns 0, a return value of @code{-ENODEV}means that there is no worker able to process this task (e.g. there is no GPUavailable and this task is only implemented for CUDA devices).@item @emph{Prototype}:@code{int starpu_task_submit(struct starpu_task *task);}@end table@node starpu_task_wait_for_all@subsection @code{starpu_task_wait_for_all} -- Wait for the termination of all Tasks@table @asis@item @emph{Description}:This function blocks until all the tasks that were submitted are terminated.@item @emph{Prototype}:@code{void starpu_task_wait_for_all(void);}@end table@c Callbacks : what can we put in callbacks ?@node Tags@section Tags@menu* starpu_tag_t::                   Task identifier* starpu_tag_declare_deps::        Declare the Dependencies of a Tag* starpu_tag_declare_deps_array::  Declare the Dependencies of a Tag* starpu_tag_wait::                Block until a Tag is terminated* starpu_tag_wait_array::          Block until a set of Tags is terminated* starpu_tag_remove::              Destroy a Tag* starpu_tag_notify_from_apps::    Feed a tag explicitly@end menu@node starpu_tag_t@subsection @code{starpu_tag_t} -- Task identifier@table @asis@item @emph{Description}:It is possible to associate a task with a unique ``tag'' and to expressdependencies between tasks by the means of those tags. To do so, fill the@code{tag_id} field of the @code{starpu_task} structure with a tag number (canbe arbitrary) and set the @code{use_tag} field to 1.If @code{starpu_tag_declare_deps} is called with this tag number, the task willnot be started until the tasks which holds the declared dependency tags arecompleted.@end table@node starpu_tag_declare_deps@subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag@table @asis@item @emph{Description}:Specify the dependencies of the task identified by tag @code{id}. 
The firstargument specifies the tag which is configured, the second argument gives thenumber of tag(s) on which @code{id} depends. The following arguments are thetags which have to be terminated to unlock the task.This function must be called before the associated task is submitted to StarPUwith @code{starpu_task_submit}.@item @emph{Remark}Because of the variable arity of @code{starpu_tag_declare_deps}, note that thelast arguments @emph{must} be of type @code{starpu_tag_t}: constant valuestypically need to be explicitly casted. Using the@code{starpu_tag_declare_deps_array} function avoids this hazard.@item @emph{Prototype}:@code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}@item @emph{Example}:@cartouche@example/*  Tag 0x1 depends on tags 0x32 and 0x52 */starpu_tag_declare_deps((starpu_tag_t)0x1,        2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);@end example@end cartouche@end table@node starpu_tag_declare_deps_array@subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag@table @asis@item @emph{Description}:This function is similar to @code{starpu_tag_declare_deps}, except that itsdoes not take a variable number of arguments but an array of tags of size@code{ndeps}.@item @emph{Prototype}:@code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}@item @emph{Example}:@cartouche@example/*  Tag 0x1 depends on tags 0x32 and 0x52 */starpu_tag_t tag_array[2] = @{0x32, 0x52@};starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);@end example@end cartouche@end table@node starpu_tag_wait@subsection @code{starpu_tag_wait} -- Block until a Tag is terminated@table @asis@item @emph{Description}:This function blocks until the task associated to tag @code{id} has beenexecuted. This is a blocking call which must therefore not be called withintasks or callbacks, but only from the application directly.  It is possible tosynchronize with the same tag multiple times, as long as the@code{starpu_tag_remove} function is not called.  Note that it is stillpossible to synchronize with a tag associated to a task which @code{starpu_task}data structure was freed (e.g. if the @code{destroy} flag of the@code{starpu_task} was enabled).@item @emph{Prototype}:@code{void starpu_tag_wait(starpu_tag_t id);}@end table@node starpu_tag_wait_array@subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated@table @asis@item @emph{Description}:This function is similar to @code{starpu_tag_wait} except that it blocks until@emph{all} the @code{ntags} tags contained in the @code{id} array areterminated.@item @emph{Prototype}:@code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}@end table@node starpu_tag_remove@subsection @code{starpu_tag_remove} -- Destroy a Tag@table @asis@item @emph{Description}:This function releases the resources associated to tag @code{id}. It can becalled once the corresponding task has been executed and when there isno other tag that depend on this tag anymore.@item @emph{Prototype}:@code{void starpu_tag_remove(starpu_tag_t id);}@end table@node starpu_tag_notify_from_apps@subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly@table @asis@item @emph{Description}:This function explicitly unlocks tag @code{id}. It may be useful in thecase of applications which execute part of their computation outside StarPUtasks (e.g. third-party libraries).  
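As a hedged sketch of this pattern (the tag values are arbitrary), the task
below depends on a tag that no StarPU task will ever terminate, so the
application has to unlock it explicitly once its external computation is done:

@cartouche
@example
/* tag 0x64 will only ever be unlocked by the application */
starpu_tag_declare_deps((starpu_tag_t)0x65, 1, (starpu_tag_t)0x64);

struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->use_tag = 1;
task->tag_id = (starpu_tag_t)0x65;
starpu_task_submit(task);

/* ... external computation (e.g. a third-party library call) ... */

starpu_tag_notify_from_apps((starpu_tag_t)0x64);
@end example
@end cartouche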
It is also provided as aconvenient tool for the programmer, for instance to entirely construct the taskDAG before actually giving StarPU the opportunity to execute the tasks.@item @emph{Prototype}:@code{void starpu_tag_notify_from_apps(starpu_tag_t id);}@end table@node CUDA extensions@section CUDA extensions@c void starpu_data_malloc_pinned_if_possible(float **A, size_t dim);@c starpu_helper_cublas_init TODO@c starpu_helper_cublas_shutdown TODO@menu* starpu_cuda_get_local_stream::   Get current worker's CUDA stream* starpu_helper_cublas_init::      Initialize CUBLAS on every CUDA device* starpu_helper_cublas_shutdown::  Deinitialize CUBLAS on every CUDA device@end menu@node starpu_cuda_get_local_stream@subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream@table @asis@item @emph{Description}:StarPU provides a stream for every CUDA device controlled by StarPU. Thisfunction is only provided for convenience so that programmers can easily useasynchronous operations within codelets without having to create a stream byhand. Note that the application is not forced to use the stream provided by@code{starpu_cuda_get_local_stream} and may also create its own streams.@item @emph{Prototype}:@code{cudaStream_t *starpu_cuda_get_local_stream(void);}@end table@node starpu_helper_cublas_init@subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device@table @asis@item @emph{Description}:The CUBLAS library must be initialized prior to any CUBLAS call. Calling@code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA devicecontrolled by StarPU. This call blocks until CUBLAS has been properlyinitialized on every device.@item @emph{Prototype}:@code{void starpu_helper_cublas_init(void);}@end table@node starpu_helper_cublas_shutdown@subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device@table @asis@item @emph{Description}:This function synchronously deinitializes the CUBLAS library on every CUDA device.@item @emph{Prototype}:@code{void starpu_helper_cublas_shutdown(void);}@end table@node OpenCL extensions@section OpenCL extensions@menu* Enabling OpenCL::                 Enabling OpenCL* Compiling OpenCL codelets::       Compiling OpenCL codelets@end menu@node Enabling OpenCL@subsection Enabling OpenCLOn GPU devices which can run both CUDA and OpenCL, CUDA will beenabled by default. To enable OpenCL, you need either to disable CUDAwhen configuring StarPU:@example$ ./configure --disable-cuda@end exampleor when running applications:@example$ STARPU_NCUDA=0 ./application@end exampleOpenCL will automatically be started on any device not yet used byCUDA. 
So on a machine running 4 GPUS, it is therefore possible toenable CUDA on 2 devices, and OpenCL on the 2 other devices by doingso:@example$ STARPU_NCUDA=2 ./application@end example@node Compiling OpenCL codelets@subsection Compiling OpenCL codeletsTODO@node Cell extensions@section Cell extensionsnothing yet.@node Miscellaneous@section Miscellaneous helpers@menu* starpu_execute_on_each_worker::   Execute a function on a subset of workers@end menu@node starpu_execute_on_each_worker@subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers@table @asis@item @emph{Description}:When calling this method, the offloaded function specified by the first argument isexecuted by every StarPU worker that may execute the function.The second argument is passed to the offloaded function.The last argument specifies on which types of processing units the functionshould be executed. Similarly to the @code{where} field of the@code{starpu_codelet} structure, it is possible to specify that the functionshould be executed on every CUDA device and every CPU by passing@code{STARPU_CPU|STARPU_CUDA}.This function blocks until the function has been executed on every appropriateprocessing units, so that it may not be called from a callback function forinstance.@item @emph{Prototype}:@code{void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where);}@end table@c ---------------------------------------------------------------------@c Basic Examples@c ---------------------------------------------------------------------@node Basic Examples@chapter Basic Examples@menu* Compiling and linking::        Compiling and Linking Options* Hello World::                  Submitting Tasks* Scaling a Vector::             Manipulating Data* Scaling a Vector (hybrid)::    Handling Heterogeneous Architectures@end menu@node Compiling and linking@section Compiling and linking optionsThe Makefile could for instance contain the following lines to define whichoptions must be given to the compiler and to the linker:@cartouche@exampleCFLAGS+=$$(pkg-config --cflags libstarpu)LIBS+=$$(pkg-config --libs libstarpu)@end example@end cartouche@node Hello World@section Hello WorldIn this section, we show how to implement a simple program that submits a task to StarPU.@subsection Required HeadersThe @code{starpu.h} header should be included in any code using StarPU.@cartouche@example#include <starpu.h>@end example@end cartouche@subsection Defining a Codelet@cartouche@examplevoid cpu_func(void *buffers[], void *cl_arg)@{    float *array = cl_arg;    printf("Hello world (array = @{%f, %f@} )\n", array[0], array[1]);@}starpu_codelet cl =@{    .where = STARPU_CPU,    .cpu_func = cpu_func,    .nbuffers = 0@};@end example@end cartoucheA codelet is a structure that represents a computational kernel. Such a codeletmay contain an implementation of the same kernel on different architectures(e.g. CUDA, Cell's SPU, x86, ...).The @code{nbuffers} field specifies the number of data buffers that aremanipulated by the codelet: here the codelet does not access or modify any datathat is controlled by our data management library. Note that the argumentpassed to the codelet (the @code{cl_arg} field of the @code{starpu_task}structure) does not count as a buffer since it is not managed by our datamanagement library. @c TODO need a crossref to the proper description of "where" see bla for more ...We create a codelet which may only be executed on the CPUs. 
The @code{where}field is a bitmask that defines where the codelet may be executed. Here, the@code{STARPU_CPU} value means that only CPUs can execute this codelet(@pxref{Codelets and Tasks} for more details on this field).When a CPU core executes a codelet, it calls the @code{cpu_func} function,which @emph{must} have the following prototype:@code{void (*cpu_func)(void *buffers[], void *cl_arg)}In this example, we can ignore the first argument of this function which gives adescription of the input and output buffers (e.g. the size and the location ofthe matrices). The second argument is a pointer to a buffer passed as anargument to the codelet by the means of the @code{cl_arg} field of the@code{starpu_task} structure.@c TODO rewrite so that it is a little clearer ?Be aware that this may be a pointer to a@emph{copy} of the actual buffer, and not the pointer given by the programmer:if the codelet modifies this buffer, there is no guarantee that the initialbuffer will be modified as well: this for instance implies that the buffercannot be used as a synchronization medium.@subsection Submitting a Task@cartouche@examplevoid callback_func(void *callback_arg)@{    printf("Callback function (arg %x)\n", callback_arg);@}int main(int argc, char **argv)@{    /* initialize StarPU */    starpu_init(NULL);    struct starpu_task *task = starpu_task_create();    task->cl = &cl;    float *array[2] = @{1.0f, -1.0f@};    task->cl_arg = &array;    task->cl_arg_size = 2*sizeof(float);    task->callback_func = callback_func;    task->callback_arg = 0x42;    /* starpu_task_submit will be a blocking call */    task->synchronous = 1;    /* submit the task to StarPU */    starpu_task_submit(task);    /* terminate StarPU */    starpu_shutdown();    return 0;@}@end example@end cartoucheBefore submitting any tasks to StarPU, @code{starpu_init} must be called. The@code{NULL} argument specifies that we use default configuration. Tasks cannotbe submitted after the termination of StarPU by a call to@code{starpu_shutdown}.In the example above, a task structure is allocated by a call to@code{starpu_task_create}. This function only allocates and fills thecorresponding structure with the default settings (@pxref{starpu_task_create}),but it does not submit the task to StarPU.@c not really clear ;)The @code{cl} field is a pointer to the codelet which the task willexecute: in other words, the codelet structure describes which computationalkernel should be offloaded on the different architectures, and the taskstructure is a wrapper containing a codelet and the piece of data on which thecodelet should operate.The optional @code{cl_arg} field is a pointer to a buffer (of size@code{cl_arg_size}) with some parameters for the kerneldescribed by the codelet. For instance, if a codelet implements a computationalkernel that multiplies its input vector by a constant, the constant could bespecified by the means of this buffer.Once a task has been executed, an optional callback function can be called.While the computational kernel could be offloaded on various architectures, thecallback function is always executed on a CPU. The @code{callback_arg}pointer is passed as an argument of the callback. The prototype of a callbackfunction must be:@cartouche@examplevoid (*callback_function)(void *);@end example@end cartoucheIf the @code{synchronous} field is non-null, task submission will besynchronous: the @code{starpu_task_submit} function will not return until thetask was executed. 
@node Scaling a Vector
@section Manipulating Data: Scaling a Vector

The previous example has shown how to submit tasks. In this section we show how
StarPU tasks can manipulate data.

Programmers can describe the data layout of their application so that StarPU is
responsible for enforcing data coherency and availability across the machine.
Instead of handling complex (and non-portable) mechanisms to perform data
movements, programmers only declare which piece of data is accessed and/or
modified by a task, and StarPU makes sure that when a computational kernel
starts somewhere (e.g. on a GPU), its data are available locally.

Before submitting those tasks, the programmer first needs to declare the
different pieces of data to StarPU using the @code{starpu_*_data_register}
functions. To ease the development of applications for StarPU, it is possible
to describe multiple types of data layout. A type of data layout is called an
@b{interface}. Several interfaces are predefined in StarPU; here we will
consider the @b{vector interface}.

The following lines show how to declare an array of @code{n} elements of type
@code{float} using the vector interface:

@cartouche
@example
float tab[n];

starpu_data_handle tab_handle;
starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, n, sizeof(float));
@end example
@end cartouche

The first argument, called the @b{data handle}, is an opaque pointer which
designates the array in StarPU. This is also the structure which is used to
describe which data is used by a task. The second argument is the node number
where the data currently resides. Here it is 0 since the @code{tab} array is in
the main memory. Then comes the pointer @code{tab} where the data can be found,
the number of elements in the vector and the size of each element.

It is possible to construct a StarPU task that multiplies this vector by a
constant factor:

@cartouche
@example
float factor = 3.0;
struct starpu_task *task = starpu_task_create();

task->cl = &cl;
task->buffers[0].handle = tab_handle;
task->buffers[0].mode = STARPU_RW;
task->cl_arg = &factor;
task->cl_arg_size = sizeof(float);

task->synchronous = 1;

starpu_task_submit(task);
@end example
@end cartouche

Since the factor is constant, it does not need a preliminary declaration, and
can just be passed through the @code{cl_arg} pointer like in the previous
example. The vector parameter is described by its handle.
There are two fields in each element of the @code{buffers} array.
@code{handle} is the handle of the data, and @code{mode} specifies how the
kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
write-only and @code{STARPU_RW} for read and write access).

The definition of the codelet can be written as follows:

@cartouche
@example
void scal_func(void *buffers[], void *cl_arg)
@{
    unsigned i;
    float *factor = cl_arg;

    struct starpu_vector_interface_s *vector = buffers[0];

    /* length of the vector */
    unsigned n = STARPU_GET_VECTOR_NX(vector);

    /* local copy of the vector pointer */
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}

starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_func = scal_func,
    .nbuffers = 1
@};
@end example
@end cartouche

The second argument of the @code{scal_func} function contains a pointer to the
parameters of the codelet (given in @code{task->cl_arg}), so that we read the
constant factor from this pointer. The first argument is an array that gives
a description of every buffer passed in the @code{task->buffers} array. The
size of this array is given by the @code{nbuffers} field of the codelet
structure. For the sake of generality, this array contains pointers to the
different interfaces describing each buffer. In the case of the @b{vector
interface}, the location of the vector (resp. its length) is accessible in the
@code{ptr} (resp. @code{nx}) field of this interface. Since the vector is
accessed in a read-write fashion, any modification will automatically affect
future accesses to this vector made by other tasks.
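To read the scaled values back on the application side, the handle should be
synchronized with main memory once the task has completed. The sketch below
simply reuses the calls that also appear in the hybrid example of the next
section:

@cartouche
@example
/* make sure the submitted task has been executed */
starpu_task_wait_for_all();

/* fetch an up-to-date copy of the vector into main memory */
starpu_data_sync_with_mem(tab_handle, STARPU_R);

/* tab[] may now be read directly by the application */

/* release the data when the application is done reading it */
starpu_data_release_from_mem(tab_handle);
@end example
@end cartouche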
@node Scaling a Vector (hybrid)
@section Vector Scaling on a Hybrid CPU/GPU Machine

Contrary to the previous examples, the task submitted in this example may not
only be executed by the CPUs, but also by a CUDA device.

@menu
* Source code::                  Source of the StarPU application
* Compilation and execution::    Executing the StarPU application
@end menu

@node Source code
@subsection Source code

The CUDA implementation can be written as follows. It needs to be
compiled with a CUDA compiler such as nvcc, the NVIDIA CUDA compiler
driver.

@cartouche
@example
#include <starpu.h>

static __global__ void vector_mult_cuda(float *val, unsigned n,
                                        float factor)
@{
        unsigned i;
        for(i = 0 ; i < n ; i++)
                val[i] *= factor;
@}

extern "C" void scal_cuda_func(void *buffers[], void *_args)
@{
        float *factor = (float *)_args;
        struct starpu_vector_interface_s *vector =
                (struct starpu_vector_interface_s *) buffers[0];

        /* length of the vector */
        unsigned n = STARPU_GET_VECTOR_NX(vector);

        /* local copy of the vector pointer */
        float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

        /* TODO: use more blocks and threads in blocks */
        vector_mult_cuda<<<1,1>>>(val, n, *factor);

        cudaThreadSynchronize();
@}
@end example
@end cartouche

The CPU implementation is the same as in the previous section.
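As hinted by the TODO comment, launching a single CUDA thread leaves the
device mostly idle. One possible refinement, given here only as a sketch, is
to launch a regular grid of threads; the block size below (64) is an arbitrary
choice, not a value mandated by StarPU:

@cartouche
@example
static __global__ void vector_mult_cuda(float *val, unsigned n,
                                        float factor)
@{
        unsigned i = blockIdx.x * blockDim.x + threadIdx.x;

        if (i < n)
                val[i] *= factor;
@}

/* in scal_cuda_func, launch one thread per element */
unsigned threads_per_block = 64;
unsigned nblocks = (n + threads_per_block - 1) / threads_per_block;

vector_mult_cuda<<<nblocks, threads_per_block>>>(val, n, *factor);
cudaThreadSynchronize();
@end example
@end cartouche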
Here is the source of the main application. You can notice the value of the
field @code{where} for the codelet. We specify
@code{STARPU_CPU|STARPU_CUDA} to indicate to StarPU that the codelet
can be executed either on a CPU or on a CUDA device.

@cartouche
@example
#include <starpu.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define NX 5

extern void scal_cuda_func(void *buffers[], void *_args);
extern void scal_func(void *buffers[], void *_args);

/* @b{Definition of the codelet} */
static starpu_codelet cl = @{
        .where = STARPU_CPU|STARPU_CUDA, /* @b{It can be executed on a CPU} */
                                         /* @b{or on a CUDA device} */
        .cuda_func = scal_cuda_func,
        .cpu_func = scal_func,
        .nbuffers = 1
@};

int main(int argc, char **argv)
@{
        float *vector;
        int i, ret;
        float factor=3.0;
        struct starpu_task *task;
        starpu_data_handle tab_handle;

        starpu_init(NULL);                       /* @b{Initialising StarPU} */

        vector = (float*)malloc(NX*sizeof(float));
        assert(vector);
        for(i=0 ; i<NX ; i++) vector[i] = i;
@end example
@end cartouche

@cartouche
@example
        /* @b{Registering data within StarPU} */
        starpu_vector_data_register(&tab_handle, 0, (uintptr_t)vector,
                                    NX, sizeof(float));

        /* @b{Definition of the task} */
        task = starpu_task_create();
        task->cl = &cl;
        task->callback_func = NULL;
        task->buffers[0].handle = tab_handle;
        task->buffers[0].mode = STARPU_RW;
        task->cl_arg = &factor;
        task->cl_arg_size = sizeof(float);
@end example
@end cartouche

@cartouche
@example
        /* @b{Submitting the task} */
        ret = starpu_task_submit(task);
        if (ret == -ENODEV) @{
                fprintf(stderr, "No worker may execute this task\n");
                return 1;
        @}

        /* @b{Waiting for its termination} */
        starpu_task_wait_for_all();

        /* @b{Update the vector in RAM} */
        starpu_data_sync_with_mem(tab_handle, STARPU_R);
@end example
@end cartouche

@cartouche
@example
        /* @b{Access the data} */
        for(i=0 ; i<NX; i++) @{
                fprintf(stderr, "%f ", vector[i]);
        @}
        fprintf(stderr, "\n");

        /* @b{Release the data and shutdown StarPU} */
        starpu_data_release_from_mem(tab_handle);
        starpu_shutdown();

        return 0;
@}
@end example
@end cartouche
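Splitting this program over several files keeps the CUDA part isolated in a
source file handled by nvcc. The file names below are only an assumption,
chosen to match the object files listed in the Makefile of the next section:

@cartouche
@example
vector.c         /* main application and codelet definition shown above */
vector_cpu.c     /* scal_func, the CPU implementation                    */
vector_cuda.cu   /* scal_cuda_func, compiled by nvcc                     */
@end example
@end cartouche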
@node Compilation and execution
@subsection Compilation and execution

Let's suppose StarPU has been installed in the directory
@code{$STARPU_DIR}. As explained in @ref{pkg-config configuration},
the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
libraries at runtime.

@example
$ export PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
$ export LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
@end example

It is then possible to compile the application using the following
makefile:

@cartouche
@example
CFLAGS  += $(shell pkg-config --cflags libstarpu)
LDFLAGS += $(shell pkg-config --libs libstarpu)
CC       = gcc

vector: vector.o vector_cpu.o vector_cuda.o

%.o: %.cu
	nvcc $(CFLAGS) $< -c -o $@@

clean:
	rm -f vector *.o
@end example
@end cartouche

@example
$ make
@end example

and to execute it, with the default configuration:

@example
$ ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example

or, for example, by disabling CPU devices:

@example
$ STARPU_NCPUS=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example

or by disabling CUDA devices:

@example
$ STARPU_NCUDA=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example

@c TODO: Add performance model example (and update basic_examples)

@c ---------------------------------------------------------------------
@c Advanced Topics
@c ---------------------------------------------------------------------

@node Advanced Topics
@chapter Advanced Topics

@bye