\input texinfo @c -*-texinfo-*-

@c %**start of header
@setfilename starpu.info
@settitle StarPU Handbook
@c %**end of header

@include version.texi

@setchapternewpage odd

@titlepage
@title StarPU Handbook
@subtitle for StarPU @value{VERSION}
@page
@vskip 0pt plus 1fill
@comment For the @value{version-GCC} Version*
@end titlepage

@c @summarycontents
@contents
@page

@node Top
@top Preface
@cindex Preface

This manual documents the usage of StarPU version @value{VERSION}.  It
was last updated on @value{UPDATED}.

@comment
@comment  When you add a new menu item, please keep the right hand
@comment  aligned to the same column.  Do not use tabs.  This provides
@comment  better formatting.
@comment
@menu
* Introduction::                A basic introduction to using StarPU
* Installing StarPU::           How to configure, build and install StarPU
* Using StarPU::                How to run a StarPU application
* Basic Examples::              Basic examples of the use of StarPU
* Performance optimization::    How to optimize performance with StarPU
* Performance feedback::        Performance debugging tools
* StarPU MPI support::          How to combine StarPU with MPI
* Configuring StarPU::          How to configure StarPU
* StarPU API::                  The API to use StarPU
* Advanced Topics::             Advanced use of StarPU
* Full source code for the 'Scaling a Vector' example::
* Function Index::              Index of C functions.
@end menu

@c ---------------------------------------------------------------------
@c Introduction to StarPU
@c ---------------------------------------------------------------------

@node Introduction
@chapter Introduction to StarPU

@menu
* Motivation::                  Why StarPU ?
* StarPU in a Nutshell::        The Fundamentals of StarPU
@end menu

@node Motivation
@section Motivation

@c complex machines with heterogeneous cores/devices
The use of specialized hardware such as accelerators or coprocessors offers an
interesting approach to overcome the physical limits encountered by processor
architects. As a result, many machines are now equipped with one or several
accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
efforts have been devoted to offload computation onto such accelerators, very
little attention has been paid to portability concerns on the one hand, and to the
possibility of having heterogeneous accelerators and processors interact on the other hand.

StarPU is a runtime system that offers support for heterogeneous multicore
architectures. It not only offers a unified view of the computational resources
(i.e. CPUs and accelerators at the same time), but it also takes care of
efficiently mapping and executing tasks onto a heterogeneous machine while
transparently handling low-level issues such as data transfers in a portable
fashion.

@c this leads to a complicated distributed memory design
@c which is not (easily) manageable by hand

@c added value/benefits of StarPU
@c   - portability
@c   - scheduling, perf. portability

@node StarPU in a Nutshell
@section StarPU in a Nutshell

@menu
* Codelet and Tasks::
* StarPU Data Management Library::
* Research Papers::
@end menu

From a programming point of view, StarPU is not a new language but a library
that executes tasks explicitly submitted by the application.  The data that a
task manipulates are automatically transferred onto the accelerator so that the
programmer does not have to take care of complex data movements.  StarPU also
takes particular care of scheduling those tasks efficiently and allows
scheduling experts to implement custom scheduling policies in a portable
fashion.

@c explain the notion of codelet and task (i.e.
@c g(A, B))

@node Codelet and Tasks
@subsection Codelet and Tasks

One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
computational kernel that can possibly be implemented on multiple architectures
such as a CPU, a CUDA device or a Cell's SPU.

@c TODO insert illustration f : f_spu, f_cpu, ...

Another important data structure is the @b{task}. Executing a StarPU task
consists in applying a codelet on a data set, on one of the architectures on
which the codelet is implemented. In addition to the codelet that a task
uses, it also describes which data are accessed, and how they are
accessed during the computation (read and/or write).
StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
operation. The task structure can also specify a @b{callback} function that is
called once StarPU has properly executed the task. It also contains optional
fields that the application may use to give hints to the scheduler (such as
priority levels).

A task may be identified by a unique 64-bit number chosen by the application,
which we refer to as a @b{tag}.
Task dependencies can be enforced either by the means of callback functions, by
expressing dependencies between explicit tasks or by expressing dependencies
between tags (which can thus correspond to tasks that have not been submitted
yet).

@c TODO insert illustration f(Ar, Brw, Cr) + ..

@c DSM
@node StarPU Data Management Library
@subsection StarPU Data Management Library

Because StarPU schedules tasks at runtime, data transfers have to be
done automatically and ``just-in-time'' between processing units,
relieving the application programmer from explicit data transfers.
Moreover, to avoid unnecessary transfers, StarPU keeps data
where it was last needed, even if it was modified there, and it
allows multiple copies of the same data to reside at the same time on
several processing units as long as it is not modified.

@node Research Papers
@subsection Research Papers

Research papers about StarPU can be found
at
@indicateurl{http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html}

Notably, a good overview is given in the research report
@indicateurl{http://hal.archives-ouvertes.fr/inria-00467677}

@c ---------------------------------------------------------------------
@c Installing StarPU
@c ---------------------------------------------------------------------

@node Installing StarPU
@chapter Installing StarPU

@menu
* Downloading StarPU::
* Configuration of StarPU::
* Building and Installing StarPU::
@end menu

StarPU can be built and installed by the standard means of the GNU
autotools. The following chapter briefly recalls how these tools
can be used to install StarPU.

@node Downloading StarPU
@section Downloading StarPU

@menu
* Getting Sources::
* Optional dependencies::
@end menu

@node Getting Sources
@subsection Getting Sources

The simplest way to get StarPU sources is to download the latest official
release tarball from @indicateurl{https://gforge.inria.fr/frs/?group_id=1570} ,
or the latest nightly snapshot from
@indicateurl{http://starpu.gforge.inria.fr/testing/} . The following documents
how to get the very latest version from the subversion repository itself; it
should be needed only if you need the very latest changes (i.e. less than a
day old!).

The source code is managed by a Subversion server hosted by the
InriaGforge. To get the source code, you need:

@itemize
@item
To install the client side of the software Subversion if it is
not already available on your system. The software can be obtained from
@indicateurl{http://subversion.tigris.org} . If you are running
on Windows, you will probably prefer to use TortoiseSVN from
@indicateurl{http://tortoisesvn.tigris.org/} .

@item
You can check out the project's SVN repository through anonymous
access. This will provide you with a read access to the
repository.

If you need to have write access on the StarPU project, you can also choose to
become a member of the project @code{starpu}.
For this, you first need to get
an account on the gForge server. You can then send a request to join the project
(@indicateurl{https://gforge.inria.fr/project/request.php?group_id=1570}).

@item
More information on how to get a gForge account, to become a member of
a project, or on any other related task can be obtained from the
InriaGforge at @indicateurl{https://gforge.inria.fr/}. The most important
thing is to upload your public SSH key on the gForge server (see the
FAQ at @indicateurl{http://siteadmin.gforge.inria.fr/FAQ.html#Q6} for
instructions).
@end itemize

You can now check out the latest version from the Subversion server:
@itemize
@item
using the anonymous access via svn:
@example
% svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk
@end example
@item
using the anonymous access via https:
@example
% svn checkout --username anonsvn https://scm.gforge.inria.fr/svn/starpu/trunk
@end example
The password is @code{anonsvn}.
@item
using your gForge account
@example
% svn checkout svn+ssh://<login>@@scm.gforge.inria.fr/svn/starpu/trunk
@end example
@end itemize

The following step requires the availability of @code{autoconf} and
@code{automake} to generate the @code{./configure} script. This is
done by calling @code{./autogen.sh}. The required version for
@code{autoconf} is 2.60 or higher. You will also need @code{makeinfo}.

@example
% ./autogen.sh
@end example

If the autotools are not available on your machine or not recent
enough, you can choose to download the latest nightly tarball, which
is provided with a @code{configure} script.

@example
% wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
@end example

@node Optional dependencies
@subsection Optional dependencies

The topology discovery library, @code{hwloc}, is not mandatory to use StarPU
but strongly recommended.
It makes it possible to increase performance, and to
perform some topology-aware scheduling.

@code{hwloc} is available in major distributions and for most OSes and can be
downloaded from @indicateurl{http://www.open-mpi.org/software/hwloc}.

@node Configuration of StarPU
@section Configuration of StarPU

@menu
* Generating Makefiles and configuration scripts::
* Running the configuration::
@end menu

@node Generating Makefiles and configuration scripts
@subsection Generating Makefiles and configuration scripts

This step is not necessary when using the tarball releases of StarPU.  If you
are using the source code from the svn repository, you first need to generate
the configure scripts and the Makefiles.

@example
% ./autogen.sh
@end example

@node Running the configuration
@subsection Running the configuration

@example
% ./configure
@end example

Details about options that are useful to give to @code{./configure} are given in
@ref{Compilation configuration}.

@node Building and Installing StarPU
@section Building and Installing StarPU

@menu
* Building::
* Sanity Checks::
* Installing::
@end menu

@node Building
@subsection Building

@example
% make
@end example

@node Sanity Checks
@subsection Sanity Checks

In order to make sure that StarPU is working properly on the system, it is also
possible to run a test suite.

@example
% make check
@end example

@node Installing
@subsection Installing

In order to install StarPU at the location that was specified during
configuration:

@example
% make install
@end example

@c ---------------------------------------------------------------------
@c Using StarPU
@c ---------------------------------------------------------------------

@node Using StarPU
@chapter Using StarPU

@menu
* Setting flags for compiling and linking applications::
* Running a basic StarPU application::
* Kernel threads started by StarPU::
* Using accelerators::
@end menu

@node Setting flags for compiling and linking applications
@section Setting flags for compiling and linking applications

Compiling and linking an application against StarPU may require the use of
specific flags or libraries (for instance @code{CUDA} or @code{libspe2}).
To this end, it is possible to use the @code{pkg-config} tool.

If StarPU was not installed at some standard location, the path of StarPU's
library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
that @code{pkg-config} can find it. For example if StarPU was installed in
@code{$prefix_dir}:

@example
% PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
@end example

The flags required to compile or link against StarPU are then
accessible with the following commands:

@example
% pkg-config --cflags libstarpu  # options for the compiler
% pkg-config --libs libstarpu    # options for the linker
@end example

@node Running a basic StarPU application
@section Running a basic StarPU application

Basic examples using StarPU have been built in the directory
@code{$prefix_dir/lib/starpu/examples/}. You can for example run the
example @code{vector_scal}.

@example
% $prefix_dir/lib/starpu/examples/vector_scal
BEFORE : First element was 1.000000
AFTER First element is 3.140000
%
@end example

When StarPU is used for the first time, the directory
@code{$HOME/.starpu/} is created, performance models will be stored in
that directory.

Please note that buses are benchmarked when StarPU is launched for the
first time. This may take a few minutes, or less if @code{hwloc} is
installed. This step is done only once per user and per machine.

@node Kernel threads started by StarPU
@section Kernel threads started by StarPU

TODO: StarPU starts one thread per CPU core and binds them there, uses one of
them per GPU. The application is not supposed to do computations in its own
threads. TODO: add a StarPU function to bind an application thread (e.g.
the main thread) to a dedicated core (and thus disable the corresponding StarPU CPU
worker).

@node Using accelerators
@section Using accelerators

When both CUDA and OpenCL drivers are enabled, StarPU will launch an
OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
This design choice was necessary as OpenCL and CUDA cannot run at the
same time on the same NVIDIA GPU, as there is currently no interoperability
between them.

Details on how to specify devices running OpenCL and the ones running
CUDA are given in @ref{Enabling OpenCL}.

@c ---------------------------------------------------------------------
@c Basic Examples
@c ---------------------------------------------------------------------

@node Basic Examples
@chapter Basic Examples

@menu
* Compiling and linking options::
* Hello World::                 Submitting Tasks
* Scaling a Vector::            Manipulating Data
* Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
* Task and Worker Profiling::
* Partitioning Data::           Partitioning Data
* Performance model example::
* Theoretical lower bound on execution time::
* Insert Task Utility::
* More examples::               More examples shipped with StarPU
* Debugging::                   When things go wrong.
@end menu

@node Compiling and linking options
@section Compiling and linking options

Let's suppose StarPU has been installed in the directory
@code{$STARPU_DIR}. As explained in @ref{Setting flags for compiling and linking applications},
the variable @code{PKG_CONFIG_PATH} needs to be set.
It is also
necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
libraries at runtime.

@example
% PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
% LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
@end example

The Makefile could for instance contain the following lines to define which
options must be given to the compiler and to the linker:

@cartouche
@example
CFLAGS          +=      $$(pkg-config --cflags libstarpu)
LDFLAGS         +=      $$(pkg-config --libs libstarpu)
@end example
@end cartouche

@node Hello World
@section Hello World

@menu
* Required Headers::
* Defining a Codelet::
* Submitting a Task::
* Execution of Hello World::
@end menu

In this section, we show how to implement a simple program that submits a task to StarPU.

@node Required Headers
@subsection Required Headers

The @code{starpu.h} header should be included in any code using StarPU.

@cartouche
@smallexample
#include <starpu.h>
@end smallexample
@end cartouche

@node Defining a Codelet
@subsection Defining a Codelet

@cartouche
@smallexample
struct params @{
    int i;
    float f;
@};

void cpu_func(void *buffers[], void *cl_arg)
@{
    struct params *params = cl_arg;

    printf("Hello world (params = @{%i, %f@} )\n", params->i, params->f);
@}

starpu_codelet cl =
@{
    .where = STARPU_CPU,
    .cpu_func = cpu_func,
    .nbuffers = 0
@};
@end smallexample
@end cartouche

A codelet is a structure that represents a computational kernel. Such a codelet
may contain an implementation of the same kernel on different architectures
(e.g. CUDA, Cell's SPU, x86, ...).

The @code{nbuffers} field specifies the number of data buffers that are
manipulated by the codelet: here the codelet does not access or modify any data
that is controlled by our data management library.
Note that the argumentpassed to the codelet (the @code{cl_arg} field of the @code{starpu_task}structure) does not count as a buffer since it is not managed by our datamanagement library, but just contains trivial parameters.@c TODO need a crossref to the proper description of "where" see bla for more ...We create a codelet which may only be executed on the CPUs. The @code{where}field is a bitmask that defines where the codelet may be executed. Here, the@code{STARPU_CPU} value means that only CPUs can execute this codelet(@pxref{Codelets and Tasks} for more details on this field).When a CPU core executes a codelet, it calls the @code{cpu_func} function,which @emph{must} have the following prototype:@code{void (*cpu_func)(void *buffers[], void *cl_arg);}In this example, we can ignore the first argument of this function which gives adescription of the input and output buffers (e.g. the size and the location ofthe matrices) since there is none.The second argument is a pointer to a buffer passed as anargument to the codelet by the means of the @code{cl_arg} field of the@code{starpu_task} structure.@c TODO rewrite so that it is a little clearer ?Be aware that this may be a pointer to a@emph{copy} of the actual buffer, and not the pointer given by the programmer:if the codelet modifies this buffer, there is no guarantee that the initialbuffer will be modified as well: this for instance implies that the buffercannot be used as a synchronization medium. 
If synchronization is needed, datahas to be registered to StarPU, see @ref{Scaling a Vector}.@node Submitting a Task@subsection Submitting a Task@cartouche@smallexamplevoid callback_func(void *callback_arg)@{    printf("Callback function (arg %x)\n", callback_arg);@}int main(int argc, char **argv)@{    /* @b{initialize StarPU} */    starpu_init(NULL);    struct starpu_task *task = starpu_task_create();    task->cl = &cl; /* @b{Pointer to the codelet defined above} */    struct params params = @{ 1, 2.0f @};    task->cl_arg = &params;    task->cl_arg_size = sizeof(params);    task->callback_func = callback_func;    task->callback_arg = 0x42;    /* @b{starpu_task_submit will be a blocking call} */    task->synchronous = 1;    /* @b{submit the task to StarPU} */    starpu_task_submit(task);    /* @b{terminate StarPU} */    starpu_shutdown();    return 0;@}@end smallexample@end cartoucheBefore submitting any tasks to StarPU, @code{starpu_init} must be called. The@code{NULL} argument specifies that we use default configuration. Tasks cannotbe submitted after the termination of StarPU by a call to@code{starpu_shutdown}.In the example above, a task structure is allocated by a call to@code{starpu_task_create}. This function only allocates and fills thecorresponding structure with the default settings (@pxref{Codelets andTasks, starpu_task_create}), but it does not submit the task to StarPU.@c not really clear ;)The @code{cl} field is a pointer to the codelet which the task willexecute: in other words, the codelet structure describes which computationalkernel should be offloaded on the different architectures, and the taskstructure is a wrapper containing a codelet and the piece of data on which thecodelet should operate.The optional @code{cl_arg} field is a pointer to a buffer (of size@code{cl_arg_size}) with some parameters for the kerneldescribed by the codelet. 
For instance, if a codelet implements a computationalkernel that multiplies its input vector by a constant, the constant could bespecified by the means of this buffer, instead of registering it as a StarPUdata. It must however be noted that StarPU avoids making copies whenever possibleand rather passes the pointer as such, so the buffer which is pointed at must be kept allocated until the task terminates, and if several tasks are submittedwith various parameters, each of them must be given a pointer to their ownbuffer.Once a task has been executed, an optional callback function is called.While the computational kernel could be offloaded on various architectures, thecallback function is always executed on a CPU. The @code{callback_arg}pointer is passed as an argument of the callback. The prototype of a callbackfunction must be:@code{void (*callback_function)(void *);}If the @code{synchronous} field is non-zero, task submission will besynchronous: the @code{starpu_task_submit} function will not return until thetask was executed. Note that the @code{starpu_shutdown} method does notguarantee that asynchronous tasks have been executed before it returns,@code{starpu_task_wait_for_all} can be used to that effect, or data can beunregistered (@code{starpu_data_unregister(vector_handle);}), which willimplicitly wait for all the tasks scheduled to work on it, unless explicitlydisabled thanks to @code{starpu_data_set_default_sequential_consistency_flag} or@code{starpu_data_set_sequential_consistency_flag}.@node Execution of Hello World@subsection Execution of Hello World@smallexample% make hello_worldcc $(pkg-config --cflags libstarpu)  $(pkg-config --libs libstarpu) hello_world.c -o hello_world% ./hello_worldHello world (params = @{1, 2.000000@} )Callback function (arg 42)@end smallexample@node Scaling a Vector@section Manipulating Data: Scaling a VectorThe previous example has shown how to submit tasks. In this section,we show how StarPU tasks can manipulate data. 
The full source code forthis example is given in @ref{Full source code for the 'Scaling a Vector' example}.@menu* Source code of Vector Scaling::  * Execution of Vector Scaling::  @end menu@node Source code of Vector Scaling@subsection Source code of Vector ScalingProgrammers can describe the data layout of their application so that StarPU isresponsible for enforcing data coherency and availability across the machine.Instead of handling complex (and non-portable) mechanisms to perform datamovements, programmers only declare which piece of data is accessed and/ormodified by a task, and StarPU makes sure that when a computational kernelstarts somewhere (e.g. on a GPU), its data are available locally.Before submitting those tasks, the programmer first needs to declare thedifferent pieces of data to StarPU using the @code{starpu_*_data_register}functions. To ease the development of applications for StarPU, it is possibleto describe multiple types of data layout. A type of data layout is called an@b{interface}. There are different predefined interfaces available in StarPU:here we will consider the @b{vector interface}.The following lines show how to declare an array of @code{NX} elements of type@code{float} using the vector interface:@cartouche@smallexamplefloat vector[NX];starpu_data_handle vector_handle;starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,                            sizeof(vector[0]));@end smallexample@end cartoucheThe first argument, called the @b{data handle}, is an opaque pointer whichdesignates the array in StarPU. This is also the structure which is used todescribe which data is used by a task. The second argument is the node numberwhere the data originally resides. Here it is 0 since the @code{vector} array is inthe main memory. 
Then comes the pointer @code{vector} where the data can be found in main memory,the number of elements in the vector and the size of each element.The following shows how to construct a StarPU task that will manipulate thevector and a constant factor.@cartouche@smallexamplefloat factor = 3.14;struct starpu_task *task = starpu_task_create();task->cl = &cl;                          /* @b{Pointer to the codelet defined below} */task->buffers[0].handle = vector_handle; /* @b{First parameter of the codelet} */task->buffers[0].mode = STARPU_RW;task->cl_arg = &factor;task->cl_arg_size = sizeof(factor);task->synchronous = 1;starpu_task_submit(task);@end smallexample@end cartoucheSince the factor is a mere constant float value parameter,it does not need a preliminary registration, andcan just be passed through the @code{cl_arg} pointer like in the previousexample.  The vector parameter is described by its handle.There are two fields in each element of the @code{buffers} array.@code{handle} is the handle of the data, and @code{mode} specifies how thekernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} forwrite-only and @code{STARPU_RW} for read and write access).The definition of the codelet can be written as follows:@cartouche@smallexamplevoid scal_cpu_func(void *buffers[], void *cl_arg)@{    unsigned i;    float *factor = cl_arg;    /* length of the vector */    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);    /* CPU copy of the vector pointer */    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);    for (i = 0; i < n; i++)        val[i] *= *factor;@}starpu_codelet cl = @{    .where = STARPU_CPU,    .cpu_func = scal_cpu_func,    .nbuffers = 1@};@end smallexample@end cartoucheThe first argument is an array that givesa description of all the buffers passed in the @code{task->buffers}@ array. Thesize of this array is given by the @code{nbuffers} field of the codeletstructure. 
For the sake of genericity, this array contains pointers to thedifferent interfaces describing each buffer.  In the case of the @b{vectorinterface}, the location of the vector (resp. its length) is accessible in the@code{ptr} (resp. @code{nx}) of this array. Since the vector is accessed in aread-write fashion, any modification will automatically affect future accessesto this vector made by other tasks.The second argument of the @code{scal_cpu_func} function contains a pointer to theparameters of the codelet (given in @code{task->cl_arg}), so that we read theconstant factor from this pointer.@node Execution of Vector Scaling@subsection Execution of Vector Scaling@smallexample% make vector_scalcc $(pkg-config --cflags libstarpu)  $(pkg-config --libs libstarpu)  vector_scal.c   -o vector_scal% ./vector_scal0.000000 3.000000 6.000000 9.000000 12.000000@end smallexample@node Vector Scaling on an Hybrid CPU/GPU Machine@section Vector Scaling on an Hybrid CPU/GPU MachineContrary to the previous examples, the task submitted in this example may notonly be executed by the CPUs, but also by a CUDA device.@menu* Definition of the CUDA Kernel::  * Definition of the OpenCL Kernel::  * Definition of the Main Code::  * Execution of Hybrid Vector Scaling::  @end menu@node Definition of the CUDA Kernel@subsection Definition of the CUDA KernelThe CUDA implementation can be written as follows. It needs to be compiled witha CUDA compiler such as nvcc, the NVIDIA CUDA compiler driver. 
It must be notedthat the vector pointer returned by STARPU_VECTOR_GET_PTR is here a pointer in GPUmemory, so that it can be passed as such to the @code{vector_mult_cuda} kernelcall.@cartouche@smallexample#include <starpu.h>static __global__ void vector_mult_cuda(float *val, unsigned n,                                        float factor)@{    unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;    if (i < n)        val[i] *= factor;@}extern "C" void scal_cuda_func(void *buffers[], void *_args)@{    float *factor = (float *)_args;    /* length of the vector */    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);    /* CUDA copy of the vector pointer */    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);    unsigned threads_per_block = 64;    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}@}@end smallexample@end cartouche@node Definition of the OpenCL Kernel@subsection Definition of the OpenCL KernelThe OpenCL implementation can be written as follows. 
StarPU providestools to compile a OpenCL kernel stored in a file.@cartouche@smallexample__kernel void vector_mult_opencl(__global float* val, int nx, float factor)@{        const int i = get_global_id(0);        if (i < nx) @{                val[i] *= factor;        @}@}@end smallexample@end cartoucheSimilarly to CUDA, the pointer returned by @code{STARPU_VECTOR_GET_PTR} is herea device pointer, so that it is passed as such to the OpenCL kernel.@cartouche@smallexample#include <starpu.h>@i{#include <starpu_opencl.h>}@i{extern struct starpu_opencl_program programs;}void scal_opencl_func(void *buffers[], void *_args)@{    float *factor = _args;@i{    int id, devid, err;}@i{    cl_kernel kernel;}@i{    cl_command_queue queue;}@i{    cl_event event;}    /* length of the vector */    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);    /* OpenCL copy of the vector pointer */    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);@i{    id = starpu_worker_get_id();}@i{    devid = starpu_worker_get_devid(id);}@i{    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,}@i{                    "vector_mult_opencl", devid);   /* @b{Name of the codelet defined above} */}@i{    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}@i{    err = clSetKernelArg(kernel, 0, sizeof(val), &val);}@i{    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);}@i{    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);}@i{    if (err) STARPU_OPENCL_REPORT_ERROR(err);}@i{    @{}@i{        size_t global=1;}@i{        size_t local=1;}@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);}@i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}@i{    @}}@i{    clFinish(queue);}@i{    starpu_opencl_collect_stats(event);}@i{    clReleaseEvent(event);}@i{    starpu_opencl_release_kernel(kernel);}@}@end smallexample@end cartouche@node Definition of the Main Code@subsection Definition of the Main CodeThe CPU implementation is the same 
as in the previous section.Here is the source of the main application. You can notice the value of thefield @code{where} for the codelet. We specify@code{STARPU_CPU|STARPU_CUDA|STARPU_OPENCL} to indicate to StarPU that the codeletcan be executed either on a CPU or on a CUDA or an OpenCL device.@cartouche@smallexample#include <starpu.h>#define NX 2048extern void scal_cuda_func(void *buffers[], void *_args);extern void scal_cpu_func(void *buffers[], void *_args);extern void scal_opencl_func(void *buffers[], void *_args);/* @b{Definition of the codelet} */static starpu_codelet cl = @{    .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL; /* @b{It can be executed on a CPU,} */                                     /* @b{on a CUDA device, or on an OpenCL device} */    .cuda_func = scal_cuda_func;    .cpu_func = scal_cpu_func;    .opencl_func = scal_opencl_func;    .nbuffers = 1;@}#ifdef STARPU_USE_OPENCL/* @b{The compiled version of the OpenCL program} */struct starpu_opencl_program programs;#endifint main(int argc, char **argv)@{    float *vector;    int i, ret;    float factor=3.0;    struct starpu_task *task;    starpu_data_handle vector_handle;    starpu_init(NULL);                            /* @b{Initialising StarPU} */#ifdef STARPU_USE_OPENCL    starpu_opencl_load_opencl_from_file(            "examples/basic_examples/vector_scal_opencl_codelet.cl",            &programs, NULL);#endif    vector = malloc(NX*sizeof(vector[0]));    assert(vector);    for(i=0 ; i<NX ; i++) vector[i] = i;@end smallexample@end cartouche@cartouche@smallexample    /* @b{Registering data within StarPU} */    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,                                NX, sizeof(vector[0]));    /* @b{Definition of the task} */    task = starpu_task_create();    task->cl = &cl;    task->buffers[0].handle = vector_handle;    task->buffers[0].mode = STARPU_RW;    task->cl_arg = &factor;    task->cl_arg_size = sizeof(factor);@end smallexample@end 
cartouche@cartouche@smallexample    /* @b{Submitting the task} */    ret = starpu_task_submit(task);    if (ret == -ENODEV) @{            fprintf(stderr, "No worker may execute this task\n");            return 1;    @}@c TODO: Mmm, should rather be an unregistration with an implicit dependency, no?    /* @b{Waiting for its termination} */    starpu_task_wait_for_all();    /* @b{Update the vector in RAM} */    starpu_data_acquire(vector_handle, STARPU_R);@end smallexample@end cartouche@cartouche@smallexample    /* @b{Access the data} */    for(i=0 ; i<NX; i++) @{      fprintf(stderr, "%f ", vector[i]);    @}    fprintf(stderr, "\n");    /* @b{Release the RAM view of the data before unregistering it and shutting down StarPU} */    starpu_data_release(vector_handle);    starpu_data_unregister(vector_handle);    starpu_shutdown();    return 0;@}@end smallexample@end cartouche@node Execution of Hybrid Vector Scaling@subsection Execution of Hybrid Vector ScalingThe Makefile given at the beginning of the section must be extended togive the rules to compile the CUDA source code. 
Note that the sourcefile of the OpenCL kernel does not need to be compiled now, it willbe compiled at run-time when calling the function@code{starpu_opencl_load_opencl_from_file()} (@pxref{starpu_opencl_load_opencl_from_file}).@cartouche@smallexampleCFLAGS	+=	$(shell pkg-config --cflags libstarpu)LDFLAGS	+=	$(shell pkg-config --libs libstarpu)CC	=	gccvector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o%.o: %.cu       nvcc $(CFLAGS) $< -c $@clean:       rm -f vector_scal *.o@end smallexample@end cartouche@smallexample% make@end smallexampleand to execute it, with the default configuration:@smallexample% ./vector_scal0.000000 3.000000 6.000000 9.000000 12.000000@end smallexampleor for example, by disabling CPU devices:@smallexample% STARPU_NCPUS=0 ./vector_scal0.000000 3.000000 6.000000 9.000000 12.000000@end smallexampleor by disabling CUDA devices (which may permit to enable the use of OpenCL,see @ref{Using accelerators}):@smallexample% STARPU_NCUDA=0 ./vector_scal0.000000 3.000000 6.000000 9.000000 12.000000@end smallexample@node Task and Worker Profiling@section Task and Worker ProfilingA full example showing how to use the profiling API is available inthe StarPU sources in the directory @code{examples/profiling/}.@cartouche@smallexamplestruct starpu_task *task = starpu_task_create();task->cl = &cl;task->synchronous = 1;/* We will destroy the task structure by hand so that we can * query the profiling info before the task is destroyed. */task->destroy = 0;/* Submit and wait for completion (since synchronous was set to 1) */starpu_task_submit(task);/* The task is finished, get profiling information */struct starpu_task_profiling_info *info = task->profiling_info;/* How much time did it take before the task started ? */double delay += starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);/* How long was the task execution ? 
*/double length += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);/* We don't need the task structure anymore */starpu_task_destroy(task);@end smallexample@end cartouche@cartouche@smallexample/* Display the occupancy of all workers during the test */int worker;for (worker = 0; worker < starpu_worker_get_count(); worker++)@{        struct starpu_worker_profiling_info worker_info;        int ret = starpu_worker_get_profiling_info(worker, &worker_info);        STARPU_ASSERT(!ret);        double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);        double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);        double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);        float executing_ratio = 100.0*executing_time/total_time;        float sleeping_ratio = 100.0*sleeping_time/total_time;        char workername[128];        starpu_worker_get_name(worker, workername, 128);        fprintf(stderr, "Worker %s:\n", workername);        fprintf(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);        fprintf(stderr, "\texec time  : %.2lf ms (%.2f %%)\n", executing_time*1e-3,                executing_ratio);        fprintf(stderr, "\tblocked time  : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,                sleeping_ratio);@}@end smallexample@end cartouche@node Partitioning Data@section Partitioning DataAn existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:@cartouche@smallexampleint vector[NX];starpu_data_handle handle;/* Declare data to StarPU */starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));/* Partition the vector in PARTS sub-vectors */starpu_filter f =@{    .filter_func = starpu_block_filter_func_vector,    .nchildren = PARTS,    .get_nchildren = NULL,    .get_child_ops = NULL@};starpu_data_partition(handle, &f);@end smallexample@end cartouche@cartouche@smallexample/* Submit a task on each 
sub-vector */for (i=0; i<starpu_data_get_nb_children(handle); i++) @{    /* Get subdata number i (there is only 1 dimension) */    starpu_data_handle sub_handle = starpu_data_get_sub_data(handle, 1, i);    struct starpu_task *task = starpu_task_create();    task->buffers[0].handle = sub_handle;    task->buffers[0].mode = STARPU_RW;    task->cl = &cl;    task->synchronous = 1;    task->cl_arg = &factor;    task->cl_arg_size = sizeof(factor);    starpu_task_submit(task);@}@end smallexample@end cartouchePartitioning can be applied several times, see@code{examples/basic_examples/mult.c} and @code{examples/filters/}.@node Performance model example@section Performance model exampleTo achieve good scheduling, StarPU scheduling policies need to be able toestimate in advance the duration of a task. This is done by giving to codelets aperformance model. There are several kinds of performance models.@itemize@itemProviding an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),see for instance@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}. It can also be provided for each architecture (@code{STARPU_PER_ARCH} model type and @code{per_arch} field)@itemMeasured at runtime (STARPU_HISTORY_BASED model type). This assumes that for agiven set of data input/output sizes, the performance will always be about thesame. This is very true for regular kernels on GPUs for instance (<0.1% error),and just a bit less true on CPUs (~=1% error). This also assumes that there arefew different sets of data input/output sizes. StarPU will then keep record ofthe average time of previous executions on the various processing units, and useit as an estimation. History is done per task size, by using a hash of the input and output sizes as an index.It will also save it in @code{~/.starpu/sampling/codelets}for further executions, and can be observed by using the@code{starpu_perfmodel_display} command.  
The following is a small code example.@cartouche@smallexamplestatic struct starpu_perfmodel_t mult_perf_model = @{    .type = STARPU_HISTORY_BASED,    .symbol = "mult_perf_model"@};starpu_codelet cl = @{    .where = STARPU_CPU,    .cpu_func = cpu_mult,    .nbuffers = 3,    /* for the scheduling policy to be able to use performance models */    .model = &mult_perf_model@};@end smallexample@end cartouche@itemMeasured at runtime and refined by regression (STARPU_REGRESSION_*_BASEDmodel type). This still assumes performance regularity, but can workwith various data input sizes, by applying regression over observedexecution times. STARPU_REGRESSION_BASED uses an a*n^b regressionform, STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise thanSTARPU_REGRESSION_BASED, but costs a lot more to compute)@itemProvided explicitly by the application (STARPU_PER_ARCH model type): the@code{.per_arch[i].cost_model} fields have to be filled with pointers tofunctions which return the expected duration of the task in micro-seconds, oneper architecture.@end itemizeHow to use schedulers which can benefit from such performance model is explainedin @ref{Task scheduling policy}.The same can be done for task power consumption estimation, by setting the@code{power_model} field the same way as the @code{model} field. Note: fornow, the application has to give to the power consumption performance modela name which is different from the execution time performance model.@node Theoretical lower bound on execution time@section Theoretical lower bound on execution timeFor kernels with history-based performance models, StarPU can very easily provide a theoretical lowerbound for the execution time of a whole set of tasks. See forinstance @code{examples/lu/lu_example.c}: before submitting tasks,call @code{starpu_bound_start}, and after complete execution, call@code{starpu_bound_stop}. 
@code{starpu_bound_print_lp} or@code{starpu_bound_print_mps} can then be used to output a Linear Programmingproblem corresponding to the schedule of your tasks. Run it through@code{lp_solve} or any other linear programming solver, and that will give you alower bound for the total execution time of your tasks. If StarPU was compiledwith the glpk library installed, @code{starpu_bound_compute} can be used tosolve it immediately and get the optimized minimum. Its @code{integer}parameter allows to decide whether integer resolution should be computedand returned.The @code{deps} parameter tells StarPU whether to take tasks and implicit datadependencies into account. It must be understood that the linear programmingproblem size is quadratic with the number of tasks and thus the time to solve itwill be very long, it could be minutes for just a few dozen tasks. You shouldprobably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert theproblem to MPS format and then use a better solver, @code{glpsol} might bebetter than @code{lp_solve} for instance (the @code{--pcost} option may beuseful), but sometimes doesn't manage to converge. @code{cbc} might lookslower, but it is parallel. Be sure to try at least all the @code{-B} optionsof @code{lp_solve}. For instance, we often just use@code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , andthe @code{-gr} option can also be quite useful.Setting @code{deps} to 0 will only take into account the actual computationson processing units. It however still properly takes into account the varyingperformances of kernels and processing units, which is quite more accurate thanjust comparing StarPU performances with the fastest of the kernels being used.The @code{prio} parameter tells StarPU whether to simulate taking into accountthe priorities as the StarPU scheduler would, i.e. schedule prioritizedtasks before less prioritized tasks, to check to what extent this results in a less optimal solution. 
This increases even more computation time.Note that for simplicity, all this however doesn't take into account datatransfers, which are assumed to be completely overlapped.@node Insert Task Utility@section Insert Task UtilityStarPU provides the wrapper function @code{starpu_insert_task} to easethe creation and submission of tasks.@deftypefun void starpu_insert_task (starpu_codelet *@var{cl}, ...)Create and submit a task corresponding to @var{cl} with the followingarguments.  The argument list must be zero-terminated.The arguments following the codelets can be of the following types:@itemize@item@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;@item@code{STARPU_VALUE} followed  by a pointer to a constant value andthe size of the constant;@item@code{STARPU_CALLBACK} followed by a pointer to a callback function;@item@code{STARPU_CALLBACK_ARG} followed by a pointer to be given as anargument to the callback function;@item@code{STARPU_PRIORITY} followed by a integer defining a priority level.@end itemizeParameters to be passed to the codelet implementation are definedthrough the type @code{STARPU_VALUE}. 
The function@code{starpu_unpack_cl_args} must be called within the codeletimplementation to retrieve them.@end deftypefunHere is the implementation of the codelet:@smallexamplevoid func_cpu(void *descr[], void *_args)@{        int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);        float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);        int ifactor;        float ffactor;        starpu_unpack_cl_args(_args, &ifactor, &ffactor);        *x0 = *x0 * ifactor;        *x1 = *x1 * ffactor;@}starpu_codelet mycodelet = @{        .where = STARPU_CPU,        .cpu_func = func_cpu,        .nbuffers = 2@};@end smallexampleAnd the call to the @code{starpu_insert_task} wrapper:@smallexamplestarpu_insert_task(&mycodelet,                   STARPU_VALUE, &ifactor, sizeof(ifactor),                   STARPU_VALUE, &ffactor, sizeof(ffactor),                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],                   0);@end smallexampleThe call to @code{starpu_insert_task} is equivalent to the followingcode:@smallexamplestruct starpu_task *task = starpu_task_create();task->cl = &mycodelet;task->buffers[0].handle = data_handles[0];task->buffers[0].mode = STARPU_RW;task->buffers[1].handle = data_handles[1];task->buffers[1].mode = STARPU_RW;char *arg_buffer;size_t arg_buffer_size;starpu_pack_cl_args(&arg_buffer, &arg_buffer_size,		    STARPU_VALUE, &ifactor, sizeof(ifactor),		    STARPU_VALUE, &ffactor, sizeof(ffactor),		    0);task->cl_arg = arg_buffer;task->cl_arg_size = arg_buffer_size;int ret = starpu_task_submit(task);@end smallexample@node Debugging@section DebuggingStarPU provides several tools to help debugging applications. Execution tracescan be generated and displayed graphically, see @ref{Generating traces}. 
Somegdb helpers are also provided to show the whole StarPU state:@smallexample(gdb) source tools/gdbinit(gdb) help starpu@end smallexample@node More examples@section More examplesMore examples are available in the StarPU sources in the @code{examples/}directory. Simple examples include:@table @asis@item @code{incrementer/}:	Trivial incrementation test.@item @code{basic_examples/}:        Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown        in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix        product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data        interface, and an example using the variable data interface.@item @code{matvecmult/}:	OpenCL example from NVidia, adapted to StarPU.@item @code{axpy/}:	AXPY CUBLAS operation adapted to StarPU.@item @code{fortran/}:	Example of Fortran bindings.@end tableMore advanced examples include:@table @asis@item @code{filters/}:	Examples using filters, as shown in @ref{Partitioning Data}.@item @code{lu/}:	LU matrix factorization, see for instance @code{xlu_implicit.c}@item @code{cholesky/}:	Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.@end table@c ---------------------------------------------------------------------@c Performance options@c ---------------------------------------------------------------------@node Performance optimization@chapter How to optimize performance with StarPUTODO: improve!@menu* Data management::* Task submission::* Task priorities::* Task scheduling policy::* Task distribution vs Data transfer::* Data prefetch::* Power-based scheduling::* Profiling::* CUDA-specific optimizations::@end menuSimply encapsulating application kernels into tasks already permits toseamlessly support CPU and GPUs at the same time. 
To achieve good performance, afew additional changes are needed.@node Data management@section Data managementBy default, StarPU leaves replicates of data wherever they were used, in case theywill be re-used by other tasks, thus saving the data transfer time. When sometask modifies some data, all the other replicates are invalidated, and only theprocessing unit will have a valid replicate of the data. If the application knowsthat this data will not be re-used by further tasks, it should advise StarPU toimmediately replicate it to a desired list of memory nodes (given through abitmask). This can be understood like the write-through mode of CPU caches.@examplestarpu_data_set_wt_mask(img_handle, 1<<0);@end examplewill for instance request to always transfer a replicate into the main memory (node0), as bit 0 of the write-through bitmask is being set.When the application allocates data, whenever possible it should use the@code{starpu_malloc} function, which will ask CUDA orOpenCL to make the allocation itself and pin the corresponding allocatedmemory. This is needed to permit asynchronous data transfer, i.e. permit datatransfer to overlap with computations.@node Task submission@section Task submissionTo let StarPU make online optimizations, tasks should be submittedasynchronously as much as possible. Ideally, all the tasks should besubmitted, and mere calls to @code{starpu_task_wait_for_all} or@code{starpu_data_unregister} be done to wait fortermination. StarPU will then be able to rework the whole schedule, overlapcomputation with communication, manage accelerator local memory usage, etc.@node Task priorities@section Task prioritiesBy default, StarPU will consider the tasks in the order they are submitted bythe application. 
If the application programmer knows that some tasks should
be performed in priority (for instance because their output is needed by many
other tasks and may thus be a bottleneck if not executed early enough), the
@code{priority} field of the task structure should be set to transmit the
priority information to StarPU.

@node Task scheduling policy
@section Task scheduling policy

By default, StarPU uses the @code{eager} simple greedy scheduler. This is
because it provides correct load balance even if the application codelets do not
have performance models. If your application codelets have performance models
(@pxref{Performance model example} for examples showing how to do it),
you should change the scheduler thanks to the @code{STARPU_SCHED} environment
variable. For instance @code{export STARPU_SCHED=dmda}. Use
@code{STARPU_SCHED=help} to get the list of available schedulers.

@c TODO: give some details about each scheduler.

Most schedulers are based on an estimation of codelet duration on each kind
of processing unit. For this to be possible, the application programmer needs
to configure a performance model for the codelets of the application (see
@ref{Performance model example} for instance). History-based performance models
use on-line calibration.  StarPU will automatically calibrate codelets
which have never been calibrated yet. To force continuing calibration, use
@code{export STARPU_CALIBRATE=1}. This may be necessary if your application
has not-so-stable performance. Details on the current performance model status
can be obtained from the @code{starpu_perfmodel_display} command: the @code{-l}
option lists the available performance models, and the @code{-s} option selects
the performance model to be displayed.
The result looks like:

@example
$ starpu_perfmodel_display -s starpu_dlu_lu_model_22
performance model for cpu
# hash		size		mean		dev		n
5c6c3401	1572864        	1.216300e+04   	2.277778e+03   	1240
@end example

This shows that for the LU 22 kernel with a 1.5MiB matrix, the average
execution time on CPUs was about 12ms, with a 2ms standard deviation, over
1240 samples. It is a good idea to check this before doing actual performance
measurements.

If a kernel source code was modified (e.g. performance improvement), the
calibration information is stale and should be dropped, to re-calibrate from
start. This can be done by using @code{export STARPU_CALIBRATE=2}.

Note: due to CUDA limitations, to be able to measure kernel duration,
calibration mode needs to disable asynchronous data transfers. Calibration thus
disables data transfer / computation overlapping, and should thus not be used
for actual benchmarks. Note 2: history-based performance models get calibrated
only if a performance-model-based scheduler is chosen.

@node Task distribution vs Data transfer
@section Task distribution vs Data transfer

Distributing tasks to balance the load induces data transfer penalty. StarPU
thus needs to find a balance between both. The target function that the
@code{dmda} scheduler of StarPU
tries to minimize is @code{alpha * T_execution + beta * T_data_transfer}, where
@code{T_execution} is the estimated execution time of the codelet (usually
accurate), and @code{T_data_transfer} is the estimated data transfer time. The
latter is however estimated based on bus calibration before execution start,
i.e. with an idle machine. You can force bus re-calibration by running
@code{starpu_calibrate_bus}.
The beta parameter defaults to 1, but it can be
worth trying to tweak it by using @code{export STARPU_SCHED_BETA=2} for instance.

This is of course imprecise, but in practice, a rough estimation already gives
the good results that a precise estimation would give.

@node Data prefetch
@section Data prefetch

The @code{heft}, @code{dmda} and @code{pheft} scheduling policies perform data prefetch (see @ref{STARPU_PREFETCH}):
as soon as a scheduling decision is taken for a task, requests are issued to
transfer its required data to the target processing unit, if needed, so that
when the processing unit actually starts the task, its data will hopefully be
already available and it will not have to wait for the transfer to finish.

The application may want to perform some manual prefetching, for several reasons
such as excluding initial data transfers from performance measurements, or
setting up an initial statically-computed data distribution on the machine
before submitting tasks, which will thus guide StarPU toward an initial task
distribution (since StarPU will try to avoid further transfers).

This can be achieved by giving the @code{starpu_data_prefetch_on_node} function
the handle and the desired target memory node.

@node Power-based scheduling
@section Power-based scheduling

If the application can provide some power performance model (through
the @code{power_model} field of the codelet structure), StarPU will
take it into account when distributing tasks. The target function that
the @code{dmda} scheduler minimizes becomes
@code{alpha * T_execution + beta * T_data_transfer + gamma * Consumption}, where @code{Consumption}
is the estimated task consumption in Joules. To tune this parameter, use
@code{export STARPU_GAMMA=3000} for instance, to express that each Joule
(i.e. kW during 1000us) is worth 3000us execution time penalty.
Settingalpha and beta to zero permits to only take into account power consumption.This is however not sufficient to correctly optimize power: the scheduler wouldsimply tend to run all computations on the most energy-conservative processingunit. To account for the consumption of the whole machine (including idleprocessing units), the idle power of the machine should be given by setting@code{export STARPU_IDLE_POWER=200} for 200W, for instance. This value can oftenbe obtained from the machine power supplier.The power actually consumed by the total execution can be displayed by setting@code{export STARPU_PROFILING=1 STARPU_WORKER_STATS=1} .@node Profiling@section ProfilingA quick view of how many tasks each worker has executed can be obtained by setting @code{export STARPU_WORKER_STATS=1} This is a convenient way to check thatexecution did happen on accelerators without penalizing performance withthe profiling overhead.More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or bycalling @code{starpu_profiling_status_set} from the source code.Statistics on the execution can then be obtained by using @code{exportSTARPU_BUS_STATS=1} and @code{export STARPU_WORKER_STATS=1} . More details on performance feedback are provided by the next chapter.@node CUDA-specific optimizations@section CUDA-specific optimizationsDue to CUDA limitations, StarPU will have a hard time overlapping its owncommunications and the codelet computations if the application does not use adedicated CUDA stream for its computations. StarPU provides one by the use of@code{starpu_cuda_get_local_stream()} which should be used by all CUDA codeletoperations. For instance:@examplefunc <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);cudaStreamSynchronize(starpu_cuda_get_local_stream());@end exampleUnfortunately, a lot of CUDA libraries do not have stream variants ofkernels. 
That will lower the potential for overlapping.

@c ---------------------------------------------------------------------
@c Performance feedback
@c ---------------------------------------------------------------------

@node Performance feedback
@chapter Performance feedback

@menu
* On-line::       On-line performance feedback
* Off-line::      Off-line performance feedback
* Codelet performance::      Performance of codelets
@end menu

@node On-line
@section On-line performance feedback

@menu
* Enabling monitoring::     Enabling on-line performance monitoring
* Task feedback::           Per-task feedback
* Codelet feedback::        Per-codelet feedback
* Worker feedback::         Per-worker feedback
* Bus feedback::            Bus-related feedback
@end menu

@node Enabling monitoring
@subsection Enabling on-line performance monitoring

In order to enable online performance monitoring, the application can call
@code{starpu_profiling_status_set(STARPU_PROFILING_ENABLE)}. It is possible to
detect whether monitoring is already enabled or not by calling
@code{starpu_profiling_status_get()}. Enabling monitoring also reinitializes all
previously collected feedback. The @code{STARPU_PROFILING} environment variable
can also be set to 1 to achieve the same effect.

Likewise, performance monitoring is stopped by calling
@code{starpu_profiling_status_set(STARPU_PROFILING_DISABLE)}.
Note that this
does not reset the performance counters so that the application may consult
them later on.

More details about the performance monitoring API are available in section
@ref{Profiling API}.

@node Task feedback
@subsection Per-task feedback

If profiling is enabled, a pointer to a @code{starpu_task_profiling_info}
structure is put in the @code{.profiling_info} field of the @code{starpu_task}
structure when a task terminates.
This structure is automatically destroyed when the task structure is destroyed,
either automatically or by calling @code{starpu_task_destroy}.

The @code{starpu_task_profiling_info} structure indicates the date when the
task was submitted (@code{submit_time}), started (@code{start_time}), and
terminated (@code{end_time}), relative to the initialization of
StarPU with @code{starpu_init}. It also specifies the identifier of the worker
that has executed the task (@code{workerid}).
These dates are stored as @code{timespec} structures which the user may convert
into microseconds using the @code{starpu_timing_timespec_to_us} helper
function.

It is worth noting that the application may directly access this structure from
the callback executed at the end of the task. The @code{starpu_task} structure
associated to the callback currently being executed is indeed accessible with
the @code{starpu_get_current_task()} function.

@node Codelet feedback
@subsection Per-codelet feedback

The @code{per_worker_stats} field of the @code{starpu_codelet_t} structure is
an array of counters. The i-th entry of the array is incremented every time a
task implementing the codelet is executed on the i-th worker.
This array is not reinitialized when profiling is enabled or disabled.

@node Worker feedback
@subsection Per-worker feedback

The second argument returned by the @code{starpu_worker_get_profiling_info}
function is a @code{starpu_worker_profiling_info} structure that gives
statistics about the specified worker.
This structure specifies when StarPU
started collecting profiling information for that worker (@code{start_time}),
the duration of the profiling measurement interval (@code{total_time}), the
time spent executing kernels (@code{executing_time}), the time spent sleeping
because there is no task to execute at all (@code{sleeping_time}), and the
number of tasks that were executed while profiling was enabled.
These values give an estimation of the proportion of time spent doing real work,
and the time spent either sleeping because there are not enough executable
tasks or simply wasted in pure StarPU overhead.

Calling @code{starpu_worker_get_profiling_info} resets the profiling
information associated to a worker.

When an FxT trace is generated (see @ref{Generating traces}), it is also
possible to use the @code{starpu_top} script (described in @ref{starpu-top}) to
generate a graphic showing the evolution of these values over time, for
the different workers.

@node Bus feedback
@subsection Bus-related feedback

TODO

@c how to enable/disable performance monitoring

@c what kind of information do we get ?

@node Off-line
@section Off-line performance feedback

@menu
* Generating traces::       Generating traces with FxT
* Gantt diagram::           Creating a Gantt Diagram
* DAG::                     Creating a DAG with graphviz
* starpu-top::              Monitoring activity
@end menu

@node Generating traces
@subsection Generating traces with FxT

StarPU can use the FxT library (see
@indicateurl{https://savannah.nongnu.org/projects/fkt/}) to generate traces
with a limited runtime overhead.

You can either get the FxT library from CVS (autotools are required):
@example
% cvs -d :pserver:anonymous@@cvs.sv.gnu.org:/sources/fkt co FxT
% ./bootstrap
@end example

If autotools are not available on your machine, or if you prefer to do so,
FxT's code is also available as a tarball:
@example
% wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.2.tar.gz
@end example

Compiling and installing the FxT library in the
@code{$FXTDIR} path isdone following the standard procedure:@example% ./configure --prefix=$FXTDIR% make% make install@end exampleIn order to have StarPU to generate traces, StarPU should be configured withthe @code{--with-fxt} option:@example$ ./configure --with-fxt=$FXTDIR@end exampleWhen FxT is enabled, a trace is generated when StarPU is terminated by calling@code{starpu_shutdown()}). The trace is a binary file whose name has the form@code{prof_file_XXX_YYY} where @code{XXX} is the user name, and@code{YYY} is the pid of the process that used StarPU. This file is saved in the@code{/tmp/} directory by default, or by the directory specified bythe @code{STARPU_FXT_PREFIX} environment variable.@node Gantt diagram@subsection Creating a Gantt DiagramWhen the FxT trace file @code{filename} has been generated, it is possible togenerate a trace in the Paje format by calling:@example% starpu_fxt_tool -i filename@end exampleOr alternatively, setting the @code{STARPU_GENERATE_TRACE} environment variableto 1 before application execution will make StarPU do it automatically atapplication shutdown.This will create a @code{paje.trace} file in the current directory that can beinspected with the ViTE trace visualizing open-source tool. More informationabout ViTE is available at @indicateurl{http://vite.gforge.inria.fr/}. It ispossible to open the @code{paje.trace} file with ViTE by using the followingcommand:@example% vite paje.trace@end example@node DAG@subsection Creating a DAG with graphvizWhen the FxT trace file @code{filename} has been generated, it is possible togenerate a task graph in the DOT format by calling:@example$ starpu_fxt_tool -i filename@end exampleThis will create a @code{dag.dot} file in the current directory. This file is atask graph described using the DOT language. 
It is possible to get a
graphical output of the graph by using the graphviz library:
@example
$ dot -Tpdf dag.dot -o output.pdf
@end example

@node starpu-top
@subsection Monitoring activity

When the FxT trace file @code{filename} has been generated, it is possible to
generate an activity trace by calling:
@example
$ starpu_fxt_tool -i filename
@end example

This will create an @code{activity.data} file in the current
directory. A profile of the application showing the activity of StarPU
during the execution of the program can be generated:
@example
$ starpu_top.sh activity.data
@end example

This will create a file named @code{activity.eps} in the current directory.
This picture is composed of two parts.
The first part shows the activity of the different workers. The green sections
indicate which proportion of the time was spent executing kernels on the
processing unit. The red sections indicate the proportion of time spent in
StarPU: a large overhead may indicate that the granularity may be too
low, and that bigger tasks may be appropriate to use the processing unit more
efficiently. The black sections indicate that the processing unit was blocked
because there was no task to process: this may indicate a lack of parallelism
which may be alleviated by creating more tasks when it is possible.

The second part of the @code{activity.eps} picture is a graph showing the
evolution of the number of tasks available in the system during the execution.
Ready tasks are shown in black, and tasks that are submitted but not
schedulable yet are shown in grey.

@node Codelet performance
@section Performance of codelets

The performance model of codelets can be examined by using the
@code{starpu_perfmodel_display} tool:

@example
$ starpu_perfmodel_display -l
file: <malloc_pinned.hannibal>
file: <starpu_slu_lu_model_21.hannibal>
file: <starpu_slu_lu_model_11.hannibal>
file: <starpu_slu_lu_model_22.hannibal>
file: <starpu_slu_lu_model_12.hannibal>
@end example

Here, the codelets of the lu example are available.
We can examine theperformance of the 22 kernel:@example$ starpu_perfmodel_display -s starpu_slu_lu_model_22performance model for cpu# hash		size		mean		dev		n57618ab0	19660800       	2.851069e+05   	1.829369e+04   	109performance model for cuda_0# hash		size		mean		dev		n57618ab0	19660800       	1.164144e+04   	1.556094e+01   	315performance model for cuda_1# hash		size		mean		dev		n57618ab0	19660800       	1.164271e+04   	1.330628e+01   	360performance model for cuda_2# hash		size		mean		dev		n57618ab0	19660800       	1.166730e+04   	3.390395e+02   	456@end exampleWe can see that for the given size, over a sample of a few hundreds ofexecution, the GPUs are about 20 times faster than the CPUs (numbers are inus). The standard deviation is extremely low for the GPUs, and less than 10% forCPUs.@c ---------------------------------------------------------------------@c MPI support@c ---------------------------------------------------------------------@node StarPU MPI support@chapter StarPU MPI supportTODO: document include/starpu_mpi.h and explain a simple example (pingpong?)@c ---------------------------------------------------------------------@c Configuration options@c ---------------------------------------------------------------------@node Configuring StarPU@chapter Configuring StarPU@menu* Compilation configuration::   * Execution configuration through environment variables::  @end menu@node Compilation configuration@section Compilation configurationThe following arguments can be given to the @code{configure} script.@menu* Common configuration::        * Configuring workers::         * Advanced configuration::      @end menu@node Common configuration@subsection Common configuration@menu* --enable-debug::              * --enable-fast::               * --enable-verbose::            * --enable-coverage::           @end menu@node --enable-debug@subsubsection @code{--enable-debug}@table @asis@item @emph{Description}:Enable debugging messages.@end table@node 
--enable-fast@subsubsection @code{--enable-fast}@table @asis@item @emph{Description}:Do not enforce assertions, saves a lot of time spent to compute them otherwise.@end table@node --enable-verbose@subsubsection @code{--enable-verbose}@table @asis@item @emph{Description}:Augment the verbosity of the debugging messages. This can be disabledat runtime by setting the environment variable @code{STARPU_SILENT} toany value.@smallexample% STARPU_SILENT=1 ./vector_scal@end smallexample@end table@node --enable-coverage@subsubsection @code{--enable-coverage}@table @asis@item @emph{Description}:Enable flags for the @code{gcov} coverage tool.@end table@node Configuring workers@subsection Configuring workers@menu* --enable-nmaxcpus::         * --disable-cpu::               * --enable-maxcudadev::         * --disable-cuda::              * --with-cuda-dir::             * --with-cuda-include-dir::             * --with-cuda-lib-dir::             * --enable-maxopencldev::       * --disable-opencl::            * --with-opencl-dir::           * --with-opencl-include-dir::           * --with-opencl-lib-dir::           * --enable-gordon::             * --with-gordon-dir::           @end menu@node --enable-nmaxcpus@subsubsection @code{--enable-nmaxcpus=<number>}@table @asis@item @emph{Description}:Defines the maximum number of CPU cores that StarPU will support, thenavailable as the @code{STARPU_NMAXCPUS} macro.@end table@node --disable-cpu@subsubsection @code{--disable-cpu}@table @asis@item @emph{Description}:Disable the use of CPUs of the machine. Only GPUs etc. 
will be used.@end table@node --enable-maxcudadev@subsubsection @code{--enable-maxcudadev=<number>}@table @asis@item @emph{Description}:Defines the maximum number of CUDA devices that StarPU will support, thenavailable as the @code{STARPU_MAXCUDADEVS} macro.@end table@node --disable-cuda@subsubsection @code{--disable-cuda}@table @asis@item @emph{Description}:Disable the use of CUDA, even if a valid CUDA installation was detected.@end table@node --with-cuda-dir@subsubsection @code{--with-cuda-dir=<path>}@table @asis@item @emph{Description}:Specify the directory where CUDA is installed. This directory should notably contain@code{include/cuda.h}.@end table@node --with-cuda-include-dir@subsubsection @code{--with-cuda-include-dir=<path>}@table @asis@item @emph{Description}:Specify the directory where CUDA headers are installed. This directory shouldnotably contain @code{cuda.h}. This defaults to @code{/include} appended to thevalue given to @code{--with-cuda-dir}.@end table@node --with-cuda-lib-dir@subsubsection @code{--with-cuda-lib-dir=<path>}@table @asis@item @emph{Description}:Specify the directory where the CUDA library is installed. This directory shouldnotably contain the CUDA shared libraries (e.g. libcuda.so). This defaults to@code{/lib} appended to the value given to @code{--with-cuda-dir}.@end table@node --enable-maxopencldev@subsubsection @code{--enable-maxopencldev=<number>}@table @asis@item @emph{Description}:Defines the maximum number of OpenCL devices that StarPU will support, thenavailable as the @code{STARPU_MAXOPENCLDEVS} macro.@end table@node --disable-opencl@subsubsection @code{--disable-opencl}@table @asis@item @emph{Description}:Disable the use of OpenCL, even if the SDK is detected.@end table@node --with-opencl-dir@subsubsection @code{--with-opencl-dir=<path>}@table @asis@item @emph{Description}:Specify the location of the OpenCL SDK. 
This directory should notably contain@code{include/CL/cl.h} (or @code{include/OpenCL/cl.h} on Mac OS).@end table@node --with-opencl-include-dir@subsubsection @code{--with-opencl-include-dir=<path>}@table @asis@item @emph{Description}:Specify the location of OpenCL headers. This directory should notably contain@code{CL/cl.h} (or @code{OpenCL/cl.h} on Mac OS). This defaults to@code{/include} appended to the value given to @code{--with-opencl-dir}.@end table@node --with-opencl-lib-dir@subsubsection @code{--with-opencl-lib-dir=<path>}@table @asis@item @emph{Description}:Specify the location of the OpenCL library. This directory should notablycontain the OpenCL shared libraries (e.g. libOpenCL.so). This defaults to@code{/lib} appended to the value given to @code{--with-opencl-dir}.@end table@node --enable-gordon@subsubsection @code{--enable-gordon}@table @asis@item @emph{Description}:Enable the use of the Gordon runtime for Cell SPUs.@c TODO: rather default to enabled when detected@end table@node --with-gordon-dir@subsubsection @code{--with-gordon-dir=<path>}@table @asis@item @emph{Description}:Specify the location of the Gordon SDK.@end table@node Advanced configuration@subsection Advanced configuration@menu* --enable-perf-debug::         * --enable-model-debug::        * --enable-stats::              * --enable-maxbuffers::         * --enable-allocation-cache::   * --enable-opengl-render::      * --enable-blas-lib::           * --with-magma::                * --with-fxt::                  * --with-perf-model-dir::       * --with-mpicc::                * --with-goto-dir::             * --with-atlas-dir::            * --with-mkl-cflags::* --with-mkl-ldflags::@end menu@node --enable-perf-debug@subsubsection @code{--enable-perf-debug}@table @asis@item @emph{Description}:Enable performance debugging.@end table@node --enable-model-debug@subsubsection @code{--enable-model-debug}@table @asis@item @emph{Description}:Enable performance model debugging.@end table@node 
--enable-stats@subsubsection @code{--enable-stats}@table @asis@item @emph{Description}:Enable statistics.@end table@node --enable-maxbuffers@subsubsection @code{--enable-maxbuffers=<nbuffers>}@table @asis@item @emph{Description}:Define the maximum number of buffers that tasks will be able to takeas parameters, then available as the @code{STARPU_NMAXBUFS} macro.@end table@node --enable-allocation-cache@subsubsection @code{--enable-allocation-cache}@table @asis@item @emph{Description}:Enable the use of a data allocation cache to avoid the cost of it withCUDA. Still experimental.@end table@node --enable-opengl-render@subsubsection @code{--enable-opengl-render}@table @asis@item @emph{Description}:Enable the use of OpenGL for the rendering of some examples.@c TODO: rather default to enabled when detected@end table@node --enable-blas-lib@subsubsection @code{--enable-blas-lib=<name>}@table @asis@item @emph{Description}:Specify the blas library to be used by some of the examples. Thelibrary has to be 'atlas' or 'goto'.@end table@node --with-magma@subsubsection @code{--with-magma=<path>}@table @asis@item @emph{Description}:Specify where magma is installed. This directory should notably contain@code{include/magmablas.h}.@end table@node --with-fxt@subsubsection @code{--with-fxt=<path>}@table @asis@item @emph{Description}:Specify the location of FxT (for generating traces and rendering themusing ViTE). 
This directory should notably contain@code{include/fxt/fxt.h}.@c TODO add ref to other section@end table@node --with-perf-model-dir@subsubsection @code{--with-perf-model-dir=<dir>}@table @asis@item @emph{Description}:Specify where performance models should be stored (instead of defaulting to thecurrent user's home).@end table@node --with-mpicc@subsubsection @code{--with-mpicc=<path to mpicc>}@table @asis@item @emph{Description}:Specify the location of the @code{mpicc} compiler to be used for starpumpi.@end table@node --with-goto-dir@subsubsection @code{--with-goto-dir=<dir>}@table @asis@item @emph{Description}:Specify the location of GotoBLAS.@end table@node --with-atlas-dir@subsubsection @code{--with-atlas-dir=<dir>}@table @asis@item @emph{Description}:Specify the location of ATLAS. This directory should notably contain@code{include/cblas.h}.@end table@node --with-mkl-cflags@subsubsection @code{--with-mkl-cflags=<cflags>}@table @asis@item @emph{Description}:Specify the compilation flags for the MKL Library.@end table@node --with-mkl-ldflags@subsubsection @code{--with-mkl-ldflags=<ldflags>}@table @asis@item @emph{Description}:Specify the linking flags for the MKL Library. 
Note that the
@url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/}
website provides a script to determine the linking flags.
@end table

@c ---------------------------------------------------------------------
@c Environment variables
@c ---------------------------------------------------------------------

@node Execution configuration through environment variables
@section Execution configuration through environment variables

@menu
* Workers::                     Configuring workers
* Scheduling::                  Configuring the Scheduling engine
* Misc::                        Miscellaneous and debug
@end menu

Note: the values given in @code{starpu_conf} structure passed when
calling @code{starpu_init} will override the values of the environment
variables.

@node Workers
@subsection Configuring workers

@menu
* STARPU_NCPUS::                Number of CPU workers
* STARPU_NCUDA::                Number of CUDA workers
* STARPU_NOPENCL::              Number of OpenCL workers
* STARPU_NGORDON::              Number of SPU workers (Cell)
* STARPU_WORKERS_CPUID::        Bind workers to specific CPUs
* STARPU_WORKERS_CUDAID::       Select specific CUDA devices
* STARPU_WORKERS_OPENCLID::     Select specific OpenCL devices
@end menu

@node STARPU_NCPUS
@subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
@table @asis

@item @emph{Description}:
Specify the number of CPU workers (thus not including workers dedicated to control accelerators). Note that by default, StarPU will not allocate
more CPU workers than there are physical CPUs, and that some CPUs are used to control
the accelerators.

@end table

@node STARPU_NCUDA
@subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
@table @asis

@item @emph{Description}:
Specify the number of CUDA devices that StarPU can use. If
@code{STARPU_NCUDA} is lower than the number of physical devices, it is
possible to select which CUDA devices should be used by the means of the
@code{STARPU_WORKERS_CUDAID} environment variable.
By default, StarPU willcreate as many CUDA workers as there are CUDA devices.@end table@node STARPU_NOPENCL@subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers@table @asis@item @emph{Description}:OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.@end table@node STARPU_NGORDON@subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)@table @asis@item @emph{Description}:Specify the number of SPUs that StarPU can use.@end table@node STARPU_WORKERS_CPUID@subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs@table @asis@item @emph{Description}:Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}specifies on which logical CPU the different workers should bebound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the firstworker will be bound to logical CPU #0, the second CPU worker will be bound tological CPU #1 and so on.  Note that the logical ordering of the CPUs is eitherdetermined by the OS, or provided by the @code{hwloc} library in case it isavailable.Note that the first workers correspond to the CUDA workers, then come theOpenCL and the SPU, and finally the CPU workers. For example ifwe have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlledby logical CPU #0, the OpenCL device will be controlled by logical CPU #2, andthe logical CPUs #1 and #3 will be used by the CPU workers.If the number of workers is larger than the array given in@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in around-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and thethird (resp. second and fourth) workers will be put on CPU #0 (resp. 
CPU #1).This variable is ignored if the @code{use_explicit_workers_bindid} flag of the@code{starpu_conf} structure passed to @code{starpu_init} is set.@end table@node STARPU_WORKERS_CUDAID@subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices@table @asis@item @emph{Description}:Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it ispossible to select which CUDA devices should be used by StarPU. On a machineequipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and thatthey should use CUDA devices #1 and #3 (the logical ordering of the devices isthe one reported by CUDA).This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag ofthe @code{starpu_conf} structure passed to @code{starpu_init} is set.@end table@node STARPU_WORKERS_OPENCLID@subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices@table @asis@item @emph{Description}:OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag ofthe @code{starpu_conf} structure passed to @code{starpu_init} is set.@end table@node Scheduling@subsection Configuring the Scheduling engine@menu* STARPU_SCHED::                Scheduling policy* STARPU_CALIBRATE::            Calibrate performance models* STARPU_PREFETCH::             Use data prefetch* STARPU_SCHED_ALPHA::          Computation factor* STARPU_SCHED_BETA::           Communication factor@end menu@node STARPU_SCHED@subsubsection @code{STARPU_SCHED} -- Scheduling policy@table @asis@item @emph{Description}:This chooses between the different scheduling policies proposed by StarPU: workrandom, stealing, greedy, with performance models, etc.Use @code{STARPU_SCHED=help} to get the list of available schedulers.@end table@node STARPU_CALIBRATE@subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models@table 
@asis@item @emph{Description}:If this variable is set to 1, the performance models are calibrated duringthe execution. If it is set to 2, the previous values are dropped to restartcalibration from scratch. Setting this variable to 0 disables calibration; this is the default behaviour.Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.@end table@node STARPU_PREFETCH@subsubsection @code{STARPU_PREFETCH} -- Use data prefetch@table @asis@item @emph{Description}:This variable indicates whether data prefetching should be enabled (0 meansthat it is disabled). If prefetching is enabled, when a task is scheduled to beexecuted e.g. on a GPU, StarPU will request an asynchronous transfer inadvance, so that data is already present on the GPU when the task starts. As aresult, computation and data transfers are overlapped.Note that prefetching is enabled by default in StarPU.@end table@node STARPU_SCHED_ALPHA@subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor@table @asis@item @emph{Description}:To estimate the cost of a task StarPU takes into account the estimatedcomputation time (obtained thanks to performance models). The alpha factor isthe coefficient to be applied to it before adding it to the communication part.@end table@node STARPU_SCHED_BETA@subsubsection @code{STARPU_SCHED_BETA} -- Communication factor@table @asis@item @emph{Description}:To estimate the cost of a task StarPU takes into account the estimateddata transfer time (obtained thanks to performance models). 
The beta factor isthe coefficient to be applied to it before adding it to the computation part.@end table@node Misc@subsection Miscellaneous and debug@menu* STARPU_SILENT::               Disable verbose mode* STARPU_LOGFILENAME::          Select debug file name* STARPU_FXT_PREFIX::           FxT trace location* STARPU_LIMIT_GPU_MEM::        Restrict memory size on the GPUs* STARPU_GENERATE_TRACE::       Generate a Paje trace when StarPU is shut down@end menu@node STARPU_SILENT@subsubsection @code{STARPU_SILENT} -- Disable verbose mode@table @asis@item @emph{Description}:This variable allows verbose mode to be disabled at runtime when StarPUhas been configured with the option @code{--enable-verbose}.@end table@node STARPU_LOGFILENAME@subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name@table @asis@item @emph{Description}:This variable specifies in which file the debugging output should be saved.@end table@node STARPU_FXT_PREFIX@subsubsection @code{STARPU_FXT_PREFIX} -- FxT trace location@table @asis@item @emph{Description}:This variable specifies in which directory to save the trace generated if FxT is enabled.@end table@node STARPU_LIMIT_GPU_MEM@subsubsection @code{STARPU_LIMIT_GPU_MEM} -- Restrict memory size on the GPUs@table @asis@item @emph{Description}:This variable specifies the maximum number of megabytes that should beavailable to the application on each GPU. In case this value is smaller thanthe size of the memory of a GPU, StarPU pre-allocates a buffer to waste memoryon the device. 
This variable is intended to be used for experimental purposesas it emulates devices that have a limited amount of memory.@end table@node STARPU_GENERATE_TRACE@subsubsection @code{STARPU_GENERATE_TRACE} -- Generate a Paje trace when StarPU is shut down@table @asis@item @emph{Description}When set to 1, this variable indicates that StarPU should automaticallygenerate a Paje trace when starpu_shutdown is called.@end table@c ---------------------------------------------------------------------@c StarPU API@c ---------------------------------------------------------------------@node StarPU API@chapter StarPU API@menu* Initialization and Termination::  Initialization and Termination methods* Workers' Properties::         Methods to enumerate workers' properties* Data Library::                Methods to manipulate data* Data Interfaces::             * Data Partition::              * Codelets and Tasks::          Methods to construct tasks* Explicit Dependencies::       Explicit Dependencies* Implicit Data Dependencies::  Implicit Data Dependencies* Performance Model API::       * Profiling API::               Profiling API* CUDA extensions::             CUDA extensions* OpenCL extensions::           OpenCL extensions* Cell extensions::             Cell extensions* Miscellaneous helpers::       @end menu@node Initialization and Termination@section Initialization and Termination@menu* starpu_init::                 Initialize StarPU* struct starpu_conf::          StarPU runtime configuration* starpu_conf_init::     Initialize starpu_conf structure* starpu_shutdown::             Terminate StarPU@end menu@node starpu_init@subsection @code{starpu_init} -- Initialize StarPU@table @asis@item @emph{Description}:This is StarPU initialization method, which must be called prior to any otherStarPU call.  It is possible to specify StarPU's configuration (e.g. schedulingpolicy, number of cores, ...) by passing a non-null argument. 
Defaultconfiguration is used if the passed argument is @code{NULL}.@item @emph{Return value}:Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}indicates that no worker was available (so that StarPU was not initialized).@item @emph{Prototype}:@code{int starpu_init(struct starpu_conf *conf);}@end table@node struct starpu_conf@subsection @code{struct starpu_conf} -- StarPU runtime configuration@table @asis@item @emph{Description}:This structure is passed to the @code{starpu_init} function in orderto configure StarPU.When the default value is used, StarPU automatically selects the numberof processing units and takes the default scheduling policy. This parameteroverwrites the equivalent environment variables.@item @emph{Fields}:@table @asis@item @code{sched_policy_name} (default = NULL):This is the name of the scheduling policy. This can also be specified with the@code{STARPU_SCHED} environment variable.@item @code{sched_policy} (default = NULL):This is the definition of the scheduling policy. This field is ignoredif @code{sched_policy_name} is set.@item @code{ncpus} (default = -1):This is the number of CPU cores that StarPU can use. This can also bespecified with the @code{STARPU_NCPUS} environment variable.@item @code{ncuda} (default = -1):This is the number of CUDA devices that StarPU can use. This can also bespecified with the @code{STARPU_NCUDA} environment variable.@item @code{nopencl} (default = -1):This is the number of OpenCL devices that StarPU can use. This can also bespecified with the @code{STARPU_NOPENCL} environment variable.@item @code{nspus} (default = -1):This is the number of Cell SPUs that StarPU can use. 
This can also bespecified with the @code{STARPU_NGORDON} environment variable.@item @code{use_explicit_workers_bindid} (default = 0)If this flag is set, the @code{workers_bindid} array indicates where thedifferent workers are bound, otherwise StarPU automatically selects where tobind the different workers unless the @code{STARPU_WORKERS_CPUID} environmentvariable is set. The @code{STARPU_WORKERS_CPUID} environment variable isignored if the @code{use_explicit_workers_bindid} flag is set.@item @code{workers_bindid[STARPU_NMAXWORKERS]}If the @code{use_explicit_workers_bindid} flag is set, this array indicateswhere to bind the different workers. The i-th entry of the@code{workers_bindid} indicates the logical identifier of the processor whichshould execute the i-th worker. Note that the logical ordering of the CPUs iseither determined by the OS, or provided by the @code{hwloc} library in case itis available.When this flag is set, the @ref{STARPU_WORKERS_CPUID} environment variable isignored. @item @code{use_explicit_workers_cuda_gpuid} (default = 0)If this flag is set, the CUDA workers will be attached to the CUDA devicesspecified in the @code{workers_cuda_gpuid} array. Otherwise, StarPU affects theCUDA devices in a round-robin fashion.When this flag is set, the @ref{STARPU_WORKERS_CUDAID} environment variable isignored.@item @code{workers_cuda_gpuid[STARPU_NMAXWORKERS]}If the @code{use_explicit_workers_cuda_gpuid} flag is set, this array containsthe logical identifiers of the CUDA devices (as used by  @code{cudaGetDevice}).@item @code{use_explicit_workers_opencl_gpuid} (default = 0)If this flag is set, the OpenCL workers will be attached to the OpenCL devicesspecified in the @code{workers_opencl_gpuid} array. Otherwise, StarPU affects theOpenCL devices in a round-robin fashion.@item @code{workers_opencl_gpuid[STARPU_NMAXWORKERS]}:@item @code{calibrate} (default = 0):If this flag is set, StarPU will calibrate the performance models whenexecuting tasks. 
If this value is equal to -1, the default value is used. Thedefault value is overwritten by the @code{STARPU_CALIBRATE} environmentvariable when it is set.@end table@end table@node starpu_conf_init@subsection @code{starpu_conf_init} -- Initialize starpu_conf structure@table @asis@item @emph{Description}:This function initializes the @code{starpu_conf} structure passed as argumentwith the default values. In case some configuration parameters are alreadyspecified through environment variables, @code{starpu_conf_init} initializesthe fields of the structure according to the environment variables. Forinstance if @code{STARPU_CALIBRATE} is set, its value is put in the@code{.calibrate} field of the structure passed as argument.@item @emph{Return value}:Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}indicates that the argument was NULL.@item @emph{Prototype}:@code{int starpu_conf_init(struct starpu_conf *conf);}@end table@node starpu_shutdown@subsection @code{starpu_shutdown} -- Terminate StarPU@deftypefun void starpu_shutdown (void)This is StarPU termination method. 
It must be called at the end of theapplication: statistics and other post-mortem debugging information are notguaranteed to be available until this method has been called.@end deftypefun@node Workers' Properties@section Workers' Properties@menu* starpu_worker_get_count::     Get the number of processing units* starpu_worker_get_count_by_type:: Get the number of processing units of a given type* starpu_cpu_worker_get_count::  Get the number of CPU controlled by StarPU* starpu_cuda_worker_get_count::  Get the number of CUDA devices controlled by StarPU* starpu_opencl_worker_get_count::  Get the number of OpenCL devices controlled by StarPU* starpu_spu_worker_get_count::  Get the number of Cell SPUs controlled by StarPU* starpu_worker_get_id::        Get the identifier of the current worker* starpu_worker_get_ids_by_type:: Get the list of identifiers of workers with a given type* starpu_worker_get_devid::        Get the device identifier of a worker* starpu_worker_get_type::      Get the type of processing unit associated to a worker* starpu_worker_get_name::      Get the name of a worker* starpu_worker_get_memory_node:: Get the memory node of a worker @end menu@node starpu_worker_get_count@subsection @code{starpu_worker_get_count} -- Get the number of processing units@deftypefun unsigned starpu_worker_get_count (void)This function returns the number of workers (i.e. processing units executingStarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.@end deftypefun@node starpu_worker_get_count_by_type@subsection @code{starpu_worker_get_count_by_type} -- Get the number of processing units of a given type@deftypefun int starpu_worker_get_count_by_type ({enum starpu_archtype} @var{type})Returns the number of workers of the type indicated by the argument. 
A positive(or null) value is returned in case of success, @code{-EINVAL} indicates thatthe type is not valid otherwise.@end deftypefun@node starpu_cpu_worker_get_count@subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPU controlled by StarPU@deftypefun unsigned starpu_cpu_worker_get_count (void)This function returns the number of CPUs controlled by StarPU. The returnedvalue should be at most @code{STARPU_NMAXCPUS}.@end deftypefun@node starpu_cuda_worker_get_count@subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU@deftypefun unsigned starpu_cuda_worker_get_count (void)This function returns the number of CUDA devices controlled by StarPU. The returnedvalue should be at most @code{STARPU_MAXCUDADEVS}.@end deftypefun@node starpu_opencl_worker_get_count@subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU@deftypefun unsigned starpu_opencl_worker_get_count (void)This function returns the number of OpenCL devices controlled by StarPU. The returnedvalue should be at most @code{STARPU_MAXOPENCLDEVS}.@end deftypefun@node starpu_spu_worker_get_count@subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU@deftypefun unsigned starpu_spu_worker_get_count (void)This function returns the number of Cell SPUs controlled by StarPU.@end deftypefun@node starpu_worker_get_id@subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker@deftypefun int starpu_worker_get_id (void)This function returns the identifier of the worker associated to the callingthread. The returned value is either -1 if the current context is not a StarPUworker (i.e. 
when called from the application outside a task or a callback), oran integer between 0 and @code{starpu_worker_get_count() - 1}.@end deftypefun@node starpu_worker_get_ids_by_type@subsection @code{starpu_worker_get_ids_by_type} -- Get the list of identifiers of workers with a given type@deftypefun int starpu_worker_get_ids_by_type ({enum starpu_archtype} @var{type}, int *@var{workerids}, int @var{maxsize})Fill the workerids array with the identifiers of the workers that have the typeindicated in the first argument. The maxsize argument indicates the size of the workerids array. The returned value gives the number of identifiers that were putin the array. @code{-ERANGE} is returned if maxsize is lower than the number ofworkers with the appropriate type: in that case, the array is filled with themaxsize first elements. To avoid such overflows, the value of maxsize can bechosen by the means of the @code{starpu_worker_get_count_by_type} function, orby passing a value greater than or equal to @code{STARPU_NMAXWORKERS}.@end deftypefun@node starpu_worker_get_devid@subsection @code{starpu_worker_get_devid} -- Get the device identifier of a worker@deftypefun int starpu_worker_get_devid (int @var{id})This function returns the device id of the worker associated to an identifier(as returned by the @code{starpu_worker_get_id} function). In the case of aCUDA worker, this device identifier is the logical device identifier exposed byCUDA (used by the @code{cudaGetDevice} function for instance). 
The deviceidentifier of a CPU worker is the logical identifier of the core on which theworker was bound; this identifier is either provided by the OS or by the@code{hwloc} library in case it is available.@end deftypefun@node starpu_worker_get_type@subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker@deftypefun {enum starpu_archtype} starpu_worker_get_type (int @var{id})This function returns the type of worker associated to an identifier (asreturned by the @code{starpu_worker_get_id} function). The returned valueindicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPUcore, @code{STARPU_CUDA_WORKER} for a CUDA device,@code{STARPU_OPENCL_WORKER} for a OpenCL device, and@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalididentifier is unspecified.@end deftypefun@node starpu_worker_get_name@subsection @code{starpu_worker_get_name} -- Get the name of a worker@deftypefun void starpu_worker_get_name (int @var{id}, char *@var{dst}, size_t @var{maxlen})StarPU associates a unique human readable string to each processing unit. Thisfunction copies at most the @code{maxlen} first bytes of the unique stringassociated to a worker identified by its identifier @code{id} into the@code{dst} buffer. The caller is responsible for ensuring that the @code{dst}is a valid pointer to a buffer of @code{maxlen} bytes at least. 
Calling thisfunction on an invalid identifier results in an unspecified behaviour.@end deftypefun@node starpu_worker_get_memory_node@subsection @code{starpu_worker_get_memory_node} -- Get the memory node of a worker@deftypefun unsigned starpu_worker_get_memory_node (unsigned @var{workerid})This function returns the identifier of the memory node associated to theworker identified by @code{workerid}.@end deftypefun@node Data Library@section Data LibraryThis section describes the data management facilities provided by StarPU.We show how to use existing data interfaces in @ref{Data Interfaces}, but developers candesign their own data interfaces if required.@menu* starpu_malloc::          Allocate data and pin it* starpu_access_mode::          Data access mode* unsigned memory_node::        Memory node* starpu_data_handle::          StarPU opaque data handle* void *interface::             StarPU data interface* starpu_data_register::        Register a piece of data to StarPU* starpu_data_unregister::      Unregister a piece of data from StarPU* starpu_data_invalidate::      Invalidate all data replicates* starpu_data_acquire::         Access registered data from the application* starpu_data_acquire_cb::      Access registered data from the application asynchronously* starpu_data_release::         Release registered data from the application* starpu_data_set_wt_mask::     Set the Write-Through mask* starpu_data_prefetch_on_node:: Prefetch data to a given node@end menu@node starpu_malloc@subsection @code{starpu_malloc} -- Allocate data and pin it@deftypefun int starpu_malloc (void **@var{A}, size_t @var{dim})This function allocates data of the given size. It will also try to pin it inCUDA or OpenCL, so that data transfers from this buffer can be asynchronous, andthus permit data transfer and computation overlapping. 
The allocated buffer mustbe freed thanks to the @code{starpu_free} function.@end deftypefun@node starpu_access_mode@subsection @code{starpu_access_mode} -- Data access modeThis datatype describes a data access mode. The different available modes are:@table @asis@table @asis @item @code{STARPU_R} read-only mode.@item @code{STARPU_W} write-only mode.@item @code{STARPU_RW} read-write mode. This is equivalent to @code{STARPU_R|STARPU_W}.@item @code{STARPU_SCRATCH} scratch memory. A temporary buffer is allocated for the task, but StarPU does not enforce data consistency, i.e. each device has its own buffer, independently from each other (even for CPUs). This is useful for temporary variables. For now, no behaviour is defined concerning the relation with STARPU_R/W modes and the value provided at registration, i.e. the value of the scratch buffer is undefined at entry of the codelet function, but this is being considered for future extensions.@item @code{STARPU_REDUX} reduction mode. TODO: document, as well as @code{starpu_data_set_reduction_methods}@end table@end table@node unsigned memory_node@subsection @code{unsigned memory_node} -- Memory node@table @asis@item @emph{Description}:Every worker is associated to a memory node which is a logical abstraction ofthe address space from which the processing unit gets its data. For instance,the memory node associated to the different CPU workers represents main memory(RAM), the memory node associated to a GPU is DRAM embedded on the device.Every memory node is identified by a logical index which is accessible from the@code{starpu_worker_get_memory_node} function. 
When registering a piece of datato StarPU, the specified memory node indicates where the piece of datainitially resides (we also call this memory node the home node of a piece ofdata).@end table@node starpu_data_handle@subsection @code{starpu_data_handle} -- StarPU opaque data handle@table @asis@item @emph{Description}:StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece ofdata. Once a piece of data has been registered to StarPU, it is associated to a@code{starpu_data_handle} which keeps track of the state of the piece of dataover the entire machine, so that we can maintain data consistency and locatedata replicates for instance.@end table@node void *interface@subsection @code{void *interface} -- StarPU data interface@table @asis@item @emph{Description}:Data management is done at a high-level in StarPU: rather than accessing a merelist of contiguous buffers, the tasks may manipulate data that are described bya high-level construct which we call data interface.An example of data interface is the "vector" interface which describes acontiguous data array on a specific memory node. This interface is a simplestructure containing the number of elements in the array, the size of theelements, and the address of the array in the appropriate address space (thisaddress may be invalid if there is no valid copy of the array in the memorynode). 
More information on the data interfaces provided by StarPU is given in @ref{Data Interfaces}.When a piece of data managed by StarPU is used by a task, the taskimplementation is given a pointer to an interface describing a valid copy ofthe data that is accessible from the current processing unit.@end table@node starpu_data_register@subsection @code{starpu_data_register} -- Register a piece of data to StarPU@deftypefun void starpu_data_register (starpu_data_handle *@var{handleptr}, uint32_t @var{home_node}, void *@var{interface}, {struct starpu_data_interface_ops_t} *@var{ops})Register a piece of data into the handle located at the @code{handleptr}address. The @code{interface} buffer contains the initial description of thedata in the home node. The @code{ops} argument is a pointer to a structuredescribing the different methods used to manipulate this type of interface. See@ref{struct starpu_data_interface_ops_t} for more details on this structure.If @code{home_node} is -1, StarPU will automaticallyallocate the memory when it is used for thefirst time in write-only mode. Once such a data handle has been automaticallyallocated, it is possible to access it using any access mode.Note that StarPU supplies a set of predefined types of interface (e.g. vector ormatrix) which can be registered by the means of helper functions (e.g.@code{starpu_vector_data_register} or @code{starpu_matrix_data_register}).@end deftypefun@node starpu_data_unregister@subsection @code{starpu_data_unregister} -- Unregister a piece of data from StarPU@deftypefun void starpu_data_unregister (starpu_data_handle @var{handle})This function unregisters a data handle from StarPU. If the data wasautomatically allocated by StarPU because the home node was -1, allautomatically allocated buffers are freed. 
Otherwise, a valid copy of the datais put back into the home node in the buffer that was initially registered.Using a data handle that has been unregistered from StarPU results in anundefined behaviour.@end deftypefun@node starpu_data_invalidate@subsection @code{starpu_data_invalidate} -- Invalidate all data replicates@deftypefun void starpu_data_invalidate (starpu_data_handle @var{handle})Destroy all replicates of the data handle. After data invalidation, the firstaccess to the handle must be performed in write-only mode. Accessing aninvalidated data in read-mode results in undefined behaviour.@end deftypefun@c TODO create a specific sections about user interaction with the DSM ?@node starpu_data_acquire@subsection @code{starpu_data_acquire} -- Access registered data from the application@deftypefun int starpu_data_acquire (starpu_data_handle @var{handle}, starpu_access_mode @var{mode})The application must call this function prior to accessing registered data frommain memory outside tasks. StarPU ensures that the application will get anup-to-date copy of the data in main memory located where the data wasoriginally registered, and that all concurrent accesses (e.g. from tasks) willbe consistent with the access mode specified in the @code{mode} argument.@code{starpu_data_release} must be called once the application does not need toaccess the piece of data anymore.  Note that implicit datadependencies are also enforced by @code{starpu_data_acquire}, i.e.@code{starpu_data_acquire} will wait for all tasks scheduled to work onthe data, unless implicit dependencies have been disabled explicitly by calling@code{starpu_data_set_default_sequential_consistency_flag} or@code{starpu_data_set_sequential_consistency_flag}.@code{starpu_data_acquire} is a blocking call, so that it cannot be called fromtasks or from their callbacks (in that case, @code{starpu_data_acquire} returns@code{-EDEADLK}). Upon successful completion, this function returns 0. 
@end deftypefun@node starpu_data_acquire_cb@subsection @code{starpu_data_acquire_cb} -- Access registered data from the application asynchronously@deftypefun int starpu_data_acquire_cb (starpu_data_handle @var{handle}, starpu_access_mode @var{mode}, void (*@var{callback})(void *), void *@var{arg})@code{starpu_data_acquire_cb} is the asynchronous equivalent of@code{starpu_data_acquire}. When the data specified in the first argument isavailable in the appropriate access mode, the callback function is executed.The application may access the requested data during the execution of thiscallback. The callback function must call @code{starpu_data_release} once theapplication does not need to access the piece of data anymore. Note that implicit data dependencies are also enforced by@code{starpu_data_acquire_cb} in case they are enabled. Contrary to @code{starpu_data_acquire}, this function is non-blocking and maybe called from task callbacks. Upon successful completion, this functionreturns 0.@end deftypefun@node starpu_data_release@subsection @code{starpu_data_release} -- Release registered data from the application@deftypefun void starpu_data_release (starpu_data_handle @var{handle})This function releases the piece of data acquired by the application either by@code{starpu_data_acquire} or by @code{starpu_data_acquire_cb}.@end deftypefun@node starpu_data_set_wt_mask@subsection @code{starpu_data_set_wt_mask} -- Set the Write-Through mask@deftypefun void starpu_data_set_wt_mask (starpu_data_handle @var{handle}, uint32_t @var{wt_mask})This function sets the write-through mask of a given data, i.e. 
a bitmask ofnodes where the data should be always replicated after modification.@end deftypefun@node starpu_data_prefetch_on_node@subsection @code{starpu_data_prefetch_on_node} -- Prefetch data to a given node@deftypefun int starpu_data_prefetch_on_node (starpu_data_handle @var{handle}, unsigned @var{node}, unsigned @var{async})Issue a prefetch request for a given data to a given node, i.e.requests that the data be replicated to the given node, so that it is availablethere for tasks. If the @code{async} parameter is 0, the call will block untilthe transfer is achieved, else the call will return as soon as the request isscheduled (which may however have to wait for a task completion).@end deftypefun@node Data Interfaces@section Data Interfaces@menu* Variable Interface::          * Vector Interface::            * Matrix Interface::            * 3D Matrix Interface::             * BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)::  * CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)::  @end menu@node Variable Interface@subsection Variable Interface@table @asis@item @emph{Description}:This variant of @code{starpu_data_register} uses the variable interface,i.e. for a mere single variable. @code{ptr} is the address of the variable,and @code{elemsize} is the size of the variable.@item @emph{Prototype}:@code{void starpu_variable_data_register(starpu_data_handle *handle,                                   uint32_t home_node,                                   uintptr_t ptr, size_t elemsize);}@item @emph{Example}:@cartouche@smallexamplefloat var;starpu_data_handle var_handle;starpu_variable_data_register(&var_handle, 0, (uintptr_t)&var, sizeof(var));@end smallexample@end cartouche@end table@node Vector Interface@subsection Vector Interface@table @asis@item @emph{Description}:This variant of @code{starpu_data_register} uses the vector interface,i.e. for mere arrays of elements. 
@code{ptr} is the address of the firstelement in the home node. @code{nx} is the number of elements in the vector.@code{elemsize} is the size of each element.@item @emph{Prototype}:@code{void starpu_vector_data_register(starpu_data_handle *handle, uint32_t home_node,                        uintptr_t ptr, uint32_t nx, size_t elemsize);}@item @emph{Example}:@cartouche@smallexamplefloat vector[NX];starpu_data_handle vector_handle;starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,                            sizeof(vector[0]));@end smallexample@end cartouche@end table@node Matrix Interface@subsection Matrix Interface@table @asis@item @emph{Description}:This variant of @code{starpu_data_register} uses the matrix interface, i.e. formatrices of elements. @code{ptr} is the address of the first element in the homenode. @code{ld} is the number of elements between rows. @code{nx} is the numberof elements in a row (this can be different from @code{ld} if there are extraelements for alignment for instance). @code{ny} is the number of rows.@code{elemsize} is the size of each element.@item @emph{Prototype}:@code{void starpu_matrix_data_register(starpu_data_handle *handle, uint32_t home_node,                                       uintptr_t ptr, uint32_t ld, uint32_t nx,                                       uint32_t ny, size_t elemsize);}@item @emph{Example}:@cartouche@smallexamplefloat *matrix;starpu_data_handle matrix_handle;matrix = (float*)malloc(width * height * sizeof(float));starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix,                            width, width, height, sizeof(float));@end smallexample@end cartouche@end table@node 3D Matrix Interface@subsection 3D Matrix Interface@table @asis@item @emph{Description}:This variant of @code{starpu_data_register} uses the 3D matrix interface.@code{ptr} is the address of the array of first element in the home node.@code{ldy} is the number of elements between rows. 
@code{ldz} is the number
of elements between z planes. @code{nx} is the number of elements in a row (this
can be different from @code{ldy} if there are extra elements for alignment
for instance). @code{ny} is the number of rows in a z plane (likewise with
@code{ldz}). @code{nz} is the number of z planes. @code{elemsize} is the size of
each element.
@item @emph{Prototype}:
@code{void starpu_block_data_register(starpu_data_handle *handle, uint32_t home_node,
                        uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
                        uint32_t ny, uint32_t nz, size_t elemsize);}
@item @emph{Example}:
@cartouche
@smallexample
float *block;
starpu_data_handle block_handle;
block = (float*)malloc(nx*ny*nz*sizeof(float));
starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
                           nx, nx*ny, nx, ny, nz, sizeof(float));
@end smallexample
@end cartouche
@end table

@node BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
@subsection BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)

@deftypefun void starpu_bcsr_data_register (starpu_data_handle *@var{handle}, uint32_t @var{home_node}, uint32_t @var{nnz}, uint32_t @var{nrow}, uintptr_t @var{nzval}, uint32_t *@var{colind}, uint32_t *@var{rowptr}, uint32_t @var{firstentry}, uint32_t @var{r}, uint32_t @var{c}, size_t @var{elemsize})
This variant of @code{starpu_data_register} uses the BCSR sparse matrix interface.
TODO
@end 
deftypefun@node Data Partition@section Data Partition@menu* struct starpu_data_filter::   StarPU filter structure* starpu_data_partition::       Partition Data* starpu_data_unpartition::     Unpartition Data* starpu_data_get_nb_children::  * starpu_data_get_sub_data::    * Predefined filter functions::  @end menu@node struct starpu_data_filter@subsection @code{struct starpu_data_filter} -- StarPU filter structure@table @asis@item @emph{Description}:The filter structure describes a data partitioning operation, to be given to the@code{starpu_data_partition} function, see @ref{starpu_data_partition} for an example.@item @emph{Fields}:@table @asis@item @code{filter_func}:This function fills the @code{child_interface} structure with interfaceinformation for the @code{id}-th child of the parent @code{father_interface} (among @code{nparts}).@code{void (*filter_func)(void *father_interface, void* child_interface, struct starpu_data_filter *, unsigned id, unsigned nparts);}@item @code{nchildren}:This is the number of parts to partition the data into.@item @code{get_nchildren}:This returns the number of children. This can be used instead of @code{nchildren} when the number ofchildren depends on the actual data (e.g. 
the number of blocks in a sparse
matrix).
@code{unsigned (*get_nchildren)(struct starpu_data_filter *, starpu_data_handle initial_handle);}
@item @code{get_child_ops}:
In case the resulting children use a different data interface, this function
returns which interface is used by child number @code{id}.
@code{struct starpu_data_interface_ops_t *(*get_child_ops)(struct starpu_data_filter *, unsigned id);}
@item @code{filter_arg}:
Some filters take an additional parameter, but this is usually unused.
@item @code{filter_arg_ptr}:
Some filters take an additional array parameter like the sizes of the parts, but
this is usually unused.
@end table
@end table

@node starpu_data_partition
@subsection starpu_data_partition -- Partition Data

@table @asis
@item @emph{Description}:
This requests partitioning one StarPU data @code{initial_handle} into several
subdata according to the filter @code{f}.
@item @emph{Prototype}:
@code{void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data_filter *f);}
@item @emph{Example}:
@cartouche
@smallexample
struct starpu_data_filter f = @{
    .filter_func = starpu_vertical_block_filter_func,
    .nchildren = nslicesx,
    .get_nchildren = NULL,
    .get_child_ops = NULL
@};
starpu_data_partition(A_handle, &f);
@end smallexample
@end cartouche
@end table

@node starpu_data_unpartition
@subsection starpu_data_unpartition -- Unpartition data

@table @asis
@item @emph{Description}:
This unapplies one filter, thus unpartitioning the data. 
The pieces of data arecollected back into one big piece in the @code{gathering_node} (usually 0).@item @emph{Prototype}:@code{void starpu_data_unpartition(starpu_data_handle root_data, uint32_t gathering_node);}@item @emph{Example}:@cartouche@smallexamplestarpu_data_unpartition(A_handle, 0);@end smallexample@end cartouche@end table@node starpu_data_get_nb_children@subsection starpu_data_get_nb_children@table @asis@item @emph{Description}:This function returns the number of children.@item @emph{Return value}:The number of children.@item @emph{Prototype}:@code{int starpu_data_get_nb_children(starpu_data_handle handle);}@end table@c starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i);@node starpu_data_get_sub_data@subsection starpu_data_get_sub_data@table @asis@item @emph{Description}:After partitioning a StarPU data by applying a filter,@code{starpu_data_get_sub_data} can be used to get handles for each of the dataportions. @code{root_data} is the parent data that was partitioned. @code{depth}is the number of filters to traverse (in case several filters have been applied,to e.g. partition in row blocks, and then in column blocks), and the subsequentparameters are the indexes.@item @emph{Return value}:A handle to the subdata.@item @emph{Prototype}:@code{starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_data, unsigned depth, ... );}@item @emph{Example}:@cartouche@smallexampleh = starpu_data_get_sub_data(A_handle, 1, taskx);@end smallexample@end cartouche@end table@node Predefined filter functions@subsection Predefined filter functions@menu* Partitioning BCSR Data::      * Partitioning BLAS interface::  * Partitioning Vector Data::    * Partitioning Block Data::     @end menuThis section gives a partial list of the predefined partitioning functions.Examples on how to use them are shown in @ref{Partitioning Data}. 
The completelist can be found in @code{starpu_data_filters.h} .@node Partitioning BCSR Data@subsubsection Partitioning BCSR Data@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})TODO@end deftypefun@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})TODO@end deftypefun@node Partitioning BLAS interface@subsubsection Partitioning BLAS interface@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})This partitions a dense Matrix into horizontal blocks.@end deftypefun@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})This partitions a dense Matrix into vertical blocks.@end deftypefun@node Partitioning Vector Data@subsubsection Partitioning Vector Data@deftypefun void starpu_block_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})This partitions a vector into blocks of the same size.@end deftypefun@deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})This partitions a vector into blocks of sizes given in @code{filter_arg_ptr}.@end deftypefun@deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})This partitions a vector into two blocks, the first block size 
being given in @code{filter_arg}.@end deftypefun@node Partitioning Block Data@subsubsection Partitioning Block Data@deftypefun void starpu_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})This partitions a 3D matrix along the X axis.@end deftypefun@node Codelets and Tasks@section Codelets and TasksThis section describes the interface to manipulate codelets and tasks.@deftp {Data Type} {struct starpu_codelet}The codelet structure describes a kernel that is possibly implemented on varioustargets. For compatibility, make sure to initialize the whole structure to zero.@table @asis@item @code{where}Indicates which types of processing units are able to execute the codelet.@code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet isimplemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}indicates that it is only available on Cell SPUs.@item @code{cpu_func} (optional)Is a function pointer to the CPU implementation of the codelet. Its prototypemust be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The firstargument being the array of data managed by the data management library, andthe second argument is a pointer to the argument passed from the @code{cl_arg}field of the @code{starpu_task} structure.The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear inthe @code{where} field, it must be non-null otherwise.@item @code{cuda_func} (optional)Is a function pointer to the CUDA implementation of the codelet. @emph{Thismust be a host-function written in the CUDA runtime API}. Its prototype mustbe: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}field, it must be non-null otherwise.@item @code{opencl_func} (optional)Is a function pointer to the OpenCL implementation of the codelet. 
Itsprototype must be:@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.This pointer is ignored if @code{STARPU_OPENCL} does not appear in the@code{where} field, it must be non-null otherwise.@item @code{gordon_func} (optional)This is the index of the Cell SPU implementation within the Gordon library.See Gordon documentation for more details on how to register a kernel andretrieve its index.@item @code{nbuffers}Specifies the number of arguments taken by the codelet. These arguments aremanaged by the DSM and are accessed from the @code{void *buffers[]}array. The constant argument passed with the @code{cl_arg} field of the@code{starpu_task} structure is not counted in this number.  This value shouldnot be above @code{STARPU_NMAXBUFS}.@item @code{model} (optional)This is a pointer to the task duration performance model associated to thiscodelet. This optional field is ignored when set to @code{NULL}.TODO@item @code{power_model} (optional)This is a pointer to the task power consumption performance model associatedto this codelet. This optional field is ignored when set to @code{NULL}.In the case of parallel codelets, this has to account for all processing unitsinvolved in the parallel execution.TODO@end table@end deftp@deftp {Data Type} {struct starpu_task}The @code{starpu_task} structure describes a task that can be offloaded on the variousprocessing units managed by StarPU. It instantiates a codelet. It can either beallocated dynamically with the @code{starpu_task_create} method, or declaredstatically. In the latter case, the programmer has to zero the@code{starpu_task} structure and to fill the different fields properly. Theindicated default values correspond to the configuration of a task allocatedwith @code{starpu_task_create}.@table @asis@item @code{cl}Is a pointer to the corresponding @code{starpu_codelet} data structure. Thisdescribes where the kernel should be executed, and supplies the appropriateimplementations. 
When set to @code{NULL}, no code is executed during the tasks,
such empty tasks can be useful for synchronization purposes.
@item @code{buffers}
Is an array of @code{starpu_buffer_descr_t} structures. It describes the
different pieces of data accessed by the task, and how they should be accessed.
The @code{starpu_buffer_descr_t} structure is composed of two fields, the
@code{handle} field specifies the handle of the piece of data, and the
@code{mode} field is the required access mode (e.g. @code{STARPU_RW}). The number
of entries in this array must be specified in the @code{nbuffers} field of the
@code{starpu_codelet} structure, and should not exceed @code{STARPU_NMAXBUFS}.
If insufficient, this value can be set with the @code{--enable-maxbuffers}
option when configuring StarPU.
@item @code{cl_arg} (optional; default: @code{NULL})
This pointer is passed to the codelet through the second argument
of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
In the specific case of the Cell processor, see the @code{cl_arg_size}
argument.
@item @code{cl_arg_size} (optional, Cell-specific)
In the case of the Cell processor, the @code{cl_arg} pointer is not directly
given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
at address @code{cl_arg}. In this case, the argument given to the SPU codelet
is therefore not the @code{cl_arg} pointer, but the address of the buffer in
local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
codelets, where the @code{cl_arg} pointer is given as such.
@item @code{callback_func} (optional) (default: @code{NULL})
This is a function pointer of prototype @code{void (*f)(void *)} which
specifies a possible callback. If this pointer is non-null, the callback
function is executed @emph{on the host} after the execution of the task. The
callback is passed the value contained in the @code{callback_arg} field. 
No
callback is executed if the field is set to @code{NULL}.
@item @code{callback_arg} (optional) (default: @code{NULL})
This is the pointer passed to the callback function. This field is ignored if
the @code{callback_func} is set to @code{NULL}.
@item @code{use_tag} (optional) (default: @code{0})
If set, this flag indicates that the task should be associated with the tag
contained in the @code{tag_id} field. Tags allow the application to synchronize
with the task and to express task dependencies easily.
@item @code{tag_id}
This field contains the tag associated to the task if the @code{use_tag} field
was set, it is ignored otherwise.
@item @code{synchronous}
If this flag is set, the @code{starpu_task_submit} function is blocking and
returns only when the task has been executed (or if no worker is able to
process the task). Otherwise, @code{starpu_task_submit} returns immediately.
@item @code{priority} (optional) (default: @code{STARPU_DEFAULT_PRIO})
This field indicates a level of priority for the task. This is an integer value
that must be set between the return values of the
@code{starpu_sched_get_min_priority} function for the least important tasks,
and that of the @code{starpu_sched_get_max_priority} for the most important
tasks (included). The @code{STARPU_MIN_PRIO} and @code{STARPU_MAX_PRIO} macros
are provided for convenience and respectively return the values of
@code{starpu_sched_get_min_priority} and @code{starpu_sched_get_max_priority}.
Default priority is @code{STARPU_DEFAULT_PRIO}, which is always defined as 0 in
order to allow static task initialization.  
Scheduling strategies that takepriorities into account can use this parameter to take better schedulingdecisions, but the scheduling policy may also ignore it.@item @code{execute_on_a_specific_worker} (default: @code{0})If this flag is set, StarPU will bypass the scheduler and directly affect thistask to the worker specified by the @code{workerid} field.@item @code{workerid} (optional)If the @code{execute_on_a_specific_worker} field is set, this field indicateswhich is the identifier of the worker that should process this task (asreturned by @code{starpu_worker_get_id}). This field is ignored if@code{execute_on_a_specific_worker} field is set to 0.@item @code{detach} (optional) (default: @code{1})If this flag is set, it is not possible to synchronize with the taskby the means of @code{starpu_task_wait} later on. Internal data structuresare only guaranteed to be freed once @code{starpu_task_wait} is called if theflag is not set.@item @code{destroy} (optional) (default: @code{1})If this flag is set, the task structure will automatically be freed, eitherafter the execution of the callback if the task is detached, or during@code{starpu_task_wait} otherwise. If this flag is not set, dynamicallyallocated data structures will not be freed until @code{starpu_task_destroy} iscalled explicitly. Setting this flag for a statically allocated task structurewill result in undefined behaviour.@item @code{predicted} (output field)Predicted duration of the task. This field is only set if the schedulingstrategy used performance models.@end table@end deftp@deftypefun void starpu_task_init ({struct starpu_task} *@var{task})Initialize @var{task} with default values. This function is implicitlycalled by @code{starpu_task_create}. By default, tasks initialized with@code{starpu_task_init} must be deinitialized explicitly with@code{starpu_task_deinit}. 
Tasks can also be initialized statically, using theconstant @code{STARPU_TASK_INITIALIZER}.@end deftypefun@deftypefun {struct starpu_task *} starpu_task_create (void)Allocate a task structure and initialize it with default values. Tasksallocated dynamically with @code{starpu_task_create} are automatically freed when thetask is terminated. If the destroy flag is explicitly unset, the resources usedby the task are freed by calling@code{starpu_task_destroy}.@end deftypefun@deftypefun void starpu_task_deinit ({struct starpu_task} *@var{task})Release all the structures automatically allocated to execute @var{task}. This iscalled automatically by @code{starpu_task_destroy}, but the task structure itself is notfreed. This should be used for statically allocated tasks for instance.@end deftypefun@deftypefun void starpu_task_destroy ({struct starpu_task} *@var{task})Free the resource allocated during @code{starpu_task_create} andassociated with @var{task}. This function can be called automaticallyafter the execution of a task by setting the @code{destroy} flag of the@code{starpu_task} structure (default behaviour).  Calling this functionon a statically allocated task results in an undefined behaviour.@end deftypefun@deftypefun int starpu_task_wait ({struct starpu_task} *@var{task})This function blocks until @var{task} has been executed. It is not possible tosynchronize with a task more than once. It is not possible to wait forsynchronous or detached tasks.Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}indicates that the specified task was either synchronous or detached.@end deftypefun@deftypefun int starpu_task_submit ({struct starpu_task} *@var{task})This function submits @var{task} to StarPU. 
Calling this function doesnot mean that the task will be executed immediately as there can be data or task(tag) dependencies that are not fulfilled yet: StarPU will take care ofscheduling this task with respect to such dependencies.This function returns immediately if the @code{synchronous} field of the@code{starpu_task} structure was set to 0, and block until the termination ofthe task otherwise. It is also possible to synchronize the application withasynchronous tasks by the means of tags, using the @code{starpu_tag_wait}function for instance.In case of success, this function returns 0, a return value of @code{-ENODEV}means that there is no worker able to process this task (e.g. there is no GPUavailable and this task is only implemented for CUDA devices).@end deftypefun@deftypefun int starpu_task_wait_for_all (void)This function blocks until all the tasks that were submitted are terminated.@end deftypefun@deftypefun {struct starpu_task *} starpu_get_current_task (void)This function returns the task currently executed by the worker, orNULL if it is called either from a thread that is not a task or simplybecause there is no task being executed at the moment.@end deftypefun@deftypefun void starpu_display_codelet_stats ({struct starpu_codelet_t} *@var{cl})Output on @code{stderr} some statistics on the codelet @var{cl}.@end deftypefun@c Callbacks : what can we put in callbacks ?@node Explicit Dependencies@section Explicit Dependencies@menu* starpu_task_declare_deps_array::        starpu_task_declare_deps_array* starpu_tag_t::                Task logical identifier* starpu_tag_declare_deps::     Declare the Dependencies of a Tag* starpu_tag_declare_deps_array::  Declare the Dependencies of a Tag* starpu_tag_wait::             Block until a Tag is terminated* starpu_tag_wait_array::       Block until a set of Tags is terminated* starpu_tag_remove::           Destroy a Tag* starpu_tag_notify_from_apps::  Feed a tag explicitly@end menu@node 
starpu_task_declare_deps_array
@subsection @code{starpu_task_declare_deps_array} -- Declare task dependencies

@deftypefun void starpu_task_declare_deps_array ({struct starpu_task} *@var{task}, unsigned @var{ndeps}, {struct starpu_task} *@var{task_array[]})
Declare task dependencies between a @code{task} and an array of tasks of length
@code{ndeps}. This function must be called prior to the submission of the task,
but it may be called after the submission or the execution of the tasks in the
array provided the tasks are still valid (i.e. they were not automatically
destroyed). Calling this function on a task that was already submitted or with
an entry of @code{task_array} that is not a valid task anymore results in an
undefined behaviour. If @code{ndeps} is zero, no dependency is added. It is
possible to call @code{starpu_task_declare_deps_array} multiple times on the
same task, in this case, the dependencies are added. It is possible to have
redundancy in the task dependencies.
@end deftypefun

@node starpu_tag_t
@subsection @code{starpu_tag_t} -- Task logical identifier

@table @asis
@item @emph{Description}:
It is possible to associate a task with a unique ``tag'' chosen by the application, and to express
dependencies between tasks by the means of those tags. To do so, fill the
@code{tag_id} field of the @code{starpu_task} structure with a tag number (can
be arbitrary) and set the @code{use_tag} field to 1.

If @code{starpu_tag_declare_deps} is called with this tag number, the task will
not be started until the tasks which hold the declared dependency tags are
completed.
@end table

@node starpu_tag_declare_deps
@subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag

@table @asis
@item @emph{Description}:
Specify the dependencies of the task identified by tag @code{id}. The first
argument specifies the tag which is configured, the second argument gives the
number of tag(s) on which @code{id} depends. 
The following arguments are the
tags which have to be terminated to unlock the task.

This function must be called before the associated task is submitted to StarPU
with @code{starpu_task_submit}.
@item @emph{Remark}
Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
typically need to be explicitly cast. Using the
@code{starpu_tag_declare_deps_array} function avoids this hazard.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
@item @emph{Example}:
@cartouche
@example
/*  Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_declare_deps((starpu_tag_t)0x1,
        2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
@end example
@end cartouche
@end table

@node starpu_tag_declare_deps_array
@subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag

@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_declare_deps}, except that it
does not take a variable number of arguments but an array of tags of size
@code{ndeps}.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
@item @emph{Example}:
@cartouche
@example
/*  Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_t tag_array[2] = @{0x32, 0x52@};
starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
@end example
@end cartouche
@end table

@node starpu_tag_wait
@subsection @code{starpu_tag_wait} -- Block until a Tag is terminated

@deftypefun void starpu_tag_wait (starpu_tag_t @var{id})
This function blocks until the task associated to tag @code{id} has been
executed. This is a blocking call which must therefore not be called within
tasks or callbacks, but only from the application directly.  It is possible to
synchronize with the same tag multiple times, as long as the
@code{starpu_tag_remove} function is not called.  
Note that it is still
possible to synchronize with a tag associated to a task whose @code{starpu_task}
data structure was freed (e.g. if the @code{destroy} flag of the
@code{starpu_task} was enabled).
@end deftypefun

@node starpu_tag_wait_array
@subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated

@deftypefun void starpu_tag_wait_array (unsigned @var{ntags}, starpu_tag_t *@var{id})
This function is similar to @code{starpu_tag_wait} except that it blocks until
@emph{all} the @code{ntags} tags contained in the @code{id} array are
terminated.
@end deftypefun

@node starpu_tag_remove
@subsection @code{starpu_tag_remove} -- Destroy a Tag

@deftypefun void starpu_tag_remove (starpu_tag_t @var{id})
This function releases the resources associated to tag @code{id}. It can be
called once the corresponding task has been executed and when there is
no other tag that depends on this tag anymore.
@end deftypefun

@node starpu_tag_notify_from_apps
@subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly

@deftypefun void starpu_tag_notify_from_apps (starpu_tag_t @var{id})
This function explicitly unlocks tag @code{id}. It may be useful in the
case of applications which execute part of their computation outside StarPU
tasks (e.g. third-party libraries).  
It is also provided as aconvenient tool for the programmer, for instance to entirely construct the taskDAG before actually giving StarPU the opportunity to execute the tasks.@end deftypefun@node Implicit Data Dependencies@section Implicit Data Dependencies@menu* starpu_data_set_default_sequential_consistency_flag::        starpu_data_set_default_sequential_consistency_flag* starpu_data_get_default_sequential_consistency_flag::        starpu_data_get_default_sequential_consistency_flag* starpu_data_set_sequential_consistency_flag::                starpu_data_set_sequential_consistency_flag@end menuIn this section, we describe how StarPU makes it possible to insert implicittask dependencies in order to enforce sequential data consistency. When thisdata consistency is enabled on a specific data handle, any data access willappear as sequentially consistent from the application. For instance, if theapplication submits two tasks that access the same piece of data in read-onlymode, and then a third task that access it in write mode, dependencies will beadded between the two first tasks and the third one. Implicit data dependenciesare also inserted in the case of data accesses from the application.@node starpu_data_set_default_sequential_consistency_flag@subsection @code{starpu_data_set_default_sequential_consistency_flag} -- Set default sequential consistency flag@deftypefun void starpu_data_set_default_sequential_consistency_flag (unsigned @var{flag})Set the default sequential consistency flag. If a non-zero value is passed, asequential data consistency will be enforced for all handles registered afterthis function call, otherwise it is disabled. By default, StarPU enablessequential data consistency. 
It is also possible to select the data consistency
mode of a specific data handle with the
@code{starpu_data_set_sequential_consistency_flag} function.
@end deftypefun

@node starpu_data_get_default_sequential_consistency_flag
@subsection @code{starpu_data_get_default_sequential_consistency_flag} -- Get current default sequential consistency flag

@deftypefun unsigned starpu_data_get_default_sequential_consistency_flag (void)
This function returns the current default sequential consistency flag.
@end deftypefun

@node starpu_data_set_sequential_consistency_flag
@subsection @code{starpu_data_set_sequential_consistency_flag} -- Set data sequential consistency mode

@deftypefun void starpu_data_set_sequential_consistency_flag (starpu_data_handle @var{handle}, unsigned @var{flag})
Select the data consistency mode associated to a data handle. The consistency
mode set using this function has the priority over the default mode which can
be set with @code{starpu_data_set_default_sequential_consistency_flag}.
@end deftypefun

@node Performance Model API
@section Performance Model API

@menu
* starpu_load_history_debug::
* starpu_perfmodel_debugfilepath::
* starpu_perfmodel_get_arch_name::
* starpu_force_bus_sampling::
@end menu

@node starpu_load_history_debug
@subsection @code{starpu_load_history_debug}

@deftypefun int starpu_load_history_debug ({const char} *@var{symbol}, {struct starpu_perfmodel_t} *@var{model})
TODO
@end deftypefun

@node starpu_perfmodel_debugfilepath
@subsection @code{starpu_perfmodel_debugfilepath}

@deftypefun void starpu_perfmodel_debugfilepath ({struct starpu_perfmodel_t} *@var{model}, {enum starpu_perf_archtype} @var{arch}, char *@var{path}, size_t @var{maxlen})
TODO
@end deftypefun

@node starpu_perfmodel_get_arch_name
@subsection @code{starpu_perfmodel_get_arch_name}

@deftypefun void starpu_perfmodel_get_arch_name ({enum starpu_perf_archtype} @var{arch}, char *@var{archname}, size_t @var{maxlen})
TODO
@end deftypefun

@node starpu_force_bus_sampling
@subsection 
@code{starpu_force_bus_sampling}

@deftypefun void starpu_force_bus_sampling (void)
This forces sampling the bus performance model again.
@end deftypefun

@node Profiling API
@section Profiling API

@menu
* starpu_profiling_status_set::  starpu_profiling_status_set
* starpu_profiling_status_get::  starpu_profiling_status_get
* struct starpu_task_profiling_info::  task profiling information
* struct starpu_worker_profiling_info::  worker profiling information
* starpu_worker_get_profiling_info::  starpu_worker_get_profiling_info
* struct starpu_bus_profiling_info::  bus profiling information
* starpu_bus_get_count::
* starpu_bus_get_id::
* starpu_bus_get_src::
* starpu_bus_get_dst::
* starpu_timing_timespec_delay_us::
* starpu_timing_timespec_to_us::
* starpu_bus_profiling_helper_display_summary::
* starpu_worker_profiling_helper_display_summary::
@end menu

@node starpu_profiling_status_set
@subsection @code{starpu_profiling_status_set} -- Set current profiling status

@table @asis
@item @emph{Description}:
This function sets the profiling status. Profiling is activated by passing
@code{STARPU_PROFILING_ENABLE} in @code{status}. Passing
@code{STARPU_PROFILING_DISABLE} disables profiling. Calling this function
resets all profiling measurements. 
When profiling is enabled, the@code{profiling_info} field of the @code{struct starpu_task} structure pointsto a valid @code{struct starpu_task_profiling_info} structure containinginformation about the execution of the task.@item @emph{Return value}:Negative return values indicate an error, otherwise the previous status isreturned.@item @emph{Prototype}:@code{int starpu_profiling_status_set(int status);}@end table@node starpu_profiling_status_get@subsection @code{starpu_profiling_status_get} -- Get current profiling status@deftypefun int starpu_profiling_status_get (void)Return the current profiling status or a negative value in case there was an error.@end deftypefun@node struct starpu_task_profiling_info@subsection @code{struct starpu_task_profiling_info} -- Task profiling information@table @asis@item @emph{Description}:This structure contains information about the execution of a task. It isaccessible from the @code{.profiling_info} field of the @code{starpu_task}structure if profiling was enabled.@item @emph{Fields}:@table @asis@item @code{submit_time}:Date of task submission (relative to the initialization of StarPU).@item @code{start_time}:Date of task execution beginning (relative to the initialization of StarPU).@item @code{end_time}:Date of task execution termination (relative to the initialization of StarPU).@item @code{workerid}:Identifier of the worker which has executed the task.@end table@end table@node struct starpu_worker_profiling_info@subsection @code{struct starpu_worker_profiling_info} -- Worker profiling information@table @asis@item @emph{Description}:This structure contains the profiling information associated to a worker.@item @emph{Fields}:@table @asis@item @code{start_time}:Starting date for the reported profiling measurements.@item @code{total_time}:Duration of the profiling measurement interval.@item @code{executing_time}:Time spent by the worker to execute tasks during the profiling measurement interval.@item @code{sleeping_time}:Time 
spent idling by the worker during the profiling measurement interval.
@item @code{executed_tasks}:
Number of tasks executed by the worker during the profiling measurement interval.
@end table
@end table

@node starpu_worker_get_profiling_info
@subsection @code{starpu_worker_get_profiling_info} -- Get worker profiling info
@table @asis
@item @emph{Description}:
Get the profiling info associated to the worker identified by @code{workerid},
and reset the profiling measurements. If the @code{worker_info} argument is
NULL, only reset the counters associated to worker @code{workerid}.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, a negative
value is returned.
@item @emph{Prototype}:
@code{int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *worker_info);}
@end table

@node struct starpu_bus_profiling_info
@subsection @code{struct starpu_bus_profiling_info} -- Bus profiling information
@table @asis
@item @emph{Description}:
TODO
@item @emph{Fields}:
@table @asis
@item @code{start_time}:
TODO
@item @code{total_time}:
TODO
@item @code{transferred_bytes}:
TODO
@item @code{transfer_count}:
TODO
@end table
@end table

@node starpu_bus_get_count
@subsection @code{starpu_bus_get_count}
@deftypefun int starpu_bus_get_count (void)
TODO
@end deftypefun

@node starpu_bus_get_id
@subsection @code{starpu_bus_get_id}
@deftypefun int starpu_bus_get_id (int @var{src}, int @var{dst})
TODO
@end deftypefun

@node starpu_bus_get_src
@subsection @code{starpu_bus_get_src}
@deftypefun int starpu_bus_get_src (int @var{busid})
TODO
@end deftypefun

@node starpu_bus_get_dst
@subsection @code{starpu_bus_get_dst}
@deftypefun int starpu_bus_get_dst (int @var{busid})
TODO
@end deftypefun

@node starpu_timing_timespec_delay_us
@subsection @code{starpu_timing_timespec_delay_us}
@deftypefun double starpu_timing_timespec_delay_us ({struct timespec} *@var{start}, {struct timespec} *@var{end})
TODO
@end deftypefun

@node starpu_timing_timespec_to_us
@subsection @code{starpu_timing_timespec_to_us}
@deftypefun double starpu_timing_timespec_to_us ({struct timespec} *@var{ts})
TODO
@end deftypefun

@node starpu_bus_profiling_helper_display_summary
@subsection @code{starpu_bus_profiling_helper_display_summary}
@deftypefun void starpu_bus_profiling_helper_display_summary (void)
TODO
@end deftypefun

@node starpu_worker_profiling_helper_display_summary
@subsection @code{starpu_worker_profiling_helper_display_summary}
@deftypefun void starpu_worker_profiling_helper_display_summary (void)
TODO
@end deftypefun

@node CUDA extensions
@section CUDA extensions

@c void starpu_malloc(float **A, size_t dim);

@menu
* starpu_cuda_get_local_stream::  Get current worker's CUDA stream
* starpu_helper_cublas_init::   Initialize CUBLAS on every CUDA device
* starpu_helper_cublas_shutdown::  Deinitialize CUBLAS on every CUDA device
@end menu

@node starpu_cuda_get_local_stream
@subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream
@deftypefun {cudaStream_t *} starpu_cuda_get_local_stream (void)
StarPU provides a stream for every CUDA device controlled by StarPU. This
function is only provided for convenience so that programmers can easily use
asynchronous operations within codelets without having to create a stream by
hand. Note that the application is not forced to use the stream provided by
@code{starpu_cuda_get_local_stream} and may also create its own streams.
Synchronizing with @code{cudaThreadSynchronize()} is allowed, but will reduce
the likelihood of having all transfers overlapped.
@end deftypefun

@node starpu_helper_cublas_init
@subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device
@deftypefun void starpu_helper_cublas_init (void)
The CUBLAS library must be initialized prior to any CUBLAS call. Calling
@code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
controlled by StarPU.
This call blocks until CUBLAS has been properly
initialized on every device.
@end deftypefun

@node starpu_helper_cublas_shutdown
@subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device
@deftypefun void starpu_helper_cublas_shutdown (void)
This function synchronously deinitializes the CUBLAS library on every CUDA device.
@end deftypefun

@node OpenCL extensions
@section OpenCL extensions

@menu
* Enabling OpenCL::            Enabling OpenCL
* Compiling OpenCL kernels::   Compiling OpenCL kernels
* Loading OpenCL kernels::     Loading OpenCL kernels
* OpenCL statistics::          Collecting statistics from OpenCL
@end menu

@node Enabling OpenCL
@subsection Enabling OpenCL

On GPU devices which can run both CUDA and OpenCL, CUDA will be
enabled by default. To enable OpenCL, you need to disable CUDA, either
when configuring StarPU:

@example
% ./configure --disable-cuda
@end example

or when running applications:

@example
% STARPU_NCUDA=0 ./application
@end example

OpenCL will automatically be started on any device not yet used by
CUDA. On a machine running 4 GPUs, it is therefore possible to
enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
so:

@example
% STARPU_NCUDA=2 ./application
@end example

@node Compiling OpenCL kernels
@subsection Compiling OpenCL kernels

Source code for OpenCL kernels can be stored in a file or in a
string. StarPU provides functions to build the program executable for
each available OpenCL device as a @code{cl_program} object. This
program executable can then be loaded within a specific queue as
explained in the next section. These are only helpers; applications
can also fill a @code{starpu_opencl_program} array by hand for more advanced
use (e.g.
different programs on the different OpenCL devices, for
relocation purposes, for instance).

@menu
* starpu_opencl_load_opencl_from_file::  Compiling OpenCL source code
* starpu_opencl_load_opencl_from_string::  Compiling OpenCL source code
* starpu_opencl_unload_opencl::  Releasing OpenCL code
@end menu

@node starpu_opencl_load_opencl_from_file
@subsubsection @code{starpu_opencl_load_opencl_from_file} -- Compiling OpenCL source code
@deftypefun int starpu_opencl_load_opencl_from_file (char *@var{source_file_name}, {struct starpu_opencl_program} *@var{opencl_programs}, {const char}* @var{build_options})
TODO
@end deftypefun

@node starpu_opencl_load_opencl_from_string
@subsubsection @code{starpu_opencl_load_opencl_from_string} -- Compiling OpenCL source code
@deftypefun int starpu_opencl_load_opencl_from_string (char *@var{opencl_program_source}, {struct starpu_opencl_program} *@var{opencl_programs}, {const char}* @var{build_options})
TODO
@end deftypefun

@node starpu_opencl_unload_opencl
@subsubsection @code{starpu_opencl_unload_opencl} -- Releasing OpenCL code
@deftypefun int starpu_opencl_unload_opencl ({struct starpu_opencl_program} *@var{opencl_programs})
TODO
@end deftypefun

@node Loading OpenCL kernels
@subsection Loading OpenCL kernels

@menu
* starpu_opencl_load_kernel::   Loading a kernel
* starpu_opencl_relase_kernel::  Releasing a kernel
@end menu

@node starpu_opencl_load_kernel
@subsubsection @code{starpu_opencl_load_kernel} -- Loading a kernel
@deftypefun int starpu_opencl_load_kernel (cl_kernel *@var{kernel}, cl_command_queue *@var{queue}, {struct starpu_opencl_program} *@var{opencl_programs}, char *@var{kernel_name}, int @var{devid})
TODO
@end deftypefun

@node starpu_opencl_relase_kernel
@subsubsection @code{starpu_opencl_release_kernel} -- Releasing a kernel
@deftypefun int starpu_opencl_release_kernel (cl_kernel @var{kernel})
TODO
@end deftypefun

@node OpenCL statistics
@subsection OpenCL statistics

@menu
* starpu_opencl_collect_stats::   Collect statistics on a kernel execution
@end menu

@node starpu_opencl_collect_stats
@subsubsection @code{starpu_opencl_collect_stats} -- Collect statistics on a kernel execution
@deftypefun int starpu_opencl_collect_stats (cl_event @var{event})
After termination of the kernels, the OpenCL codelet should call this function
to pass it the event returned by @code{clEnqueueNDRangeKernel}, to let StarPU
collect statistics about the kernel execution (used cycles, consumed power).
@end deftypefun

@node Cell extensions
@section Cell extensions

nothing yet.

@node Miscellaneous helpers
@section Miscellaneous helpers

@menu
* starpu_data_cpy::                Copy a data handle into another data handle
* starpu_execute_on_each_worker::  Execute a function on a subset of workers
@end menu

@node starpu_data_cpy
@subsection @code{starpu_data_cpy} -- Copy a data handle into another data handle
@deftypefun int starpu_data_cpy (starpu_data_handle @var{dst_handle}, starpu_data_handle @var{src_handle}, int @var{asynchronous}, void (*@var{callback_func})(void*), void *@var{callback_arg})
Copy the content of the @code{src_handle} into the @code{dst_handle} handle.
The @code{asynchronous} parameter indicates whether the function should
block or not. In the case of an asynchronous call, it is possible to
synchronize with the termination of this operation either by means of
implicit dependencies (if enabled) or by calling
@code{starpu_task_wait_for_all()}.
If @code{callback_func} is not @code{NULL},
this callback function is executed after the handle has been copied, and it is
given the @code{callback_arg} pointer as argument.
@end deftypefun

@node starpu_execute_on_each_worker
@subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
@deftypefun void starpu_execute_on_each_worker (void (*@var{func})(void *), void *@var{arg}, uint32_t @var{where})
When calling this method, the offloaded function specified by the first argument is
executed by every StarPU worker that may execute the function.
The second argument is passed to the offloaded function.
The last argument specifies on which types of processing units the function
should be executed. Similarly to the @code{where} field of the
@code{starpu_codelet} structure, it is possible to specify that the function
should be executed on every CUDA device and every CPU by passing
@code{STARPU_CPU|STARPU_CUDA}.
This function blocks until the function has been executed on every appropriate
processing unit, so it must not be called from a callback function, for
instance.
@end deftypefun

@c ---------------------------------------------------------------------
@c Advanced Topics
@c ---------------------------------------------------------------------

@node Advanced Topics
@chapter Advanced Topics

@menu
* Defining a new data interface::
* Defining a new scheduling policy::
@end menu

@node Defining a new data interface
@section Defining a new data interface

@menu
* struct starpu_data_interface_ops_t::  Per-interface methods
* struct starpu_data_copy_methods::     Per-interface data transfer methods
* An example of data interface::        An example of data interface
@end menu

@c void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memory_node); TODO

@node struct starpu_data_interface_ops_t
@subsection @code{struct starpu_data_interface_ops_t} -- Per-interface methods
@table @asis
@item @emph{Description}:
TODO describe all the different fields
@end table

@node struct starpu_data_copy_methods
@subsection @code{struct starpu_data_copy_methods} -- Per-interface data transfer methods
@table @asis
@item @emph{Description}:
TODO describe all the different fields
@end table

@node An example of data interface
@subsection An example of data interface
@table @asis
TODO
@end table

@node Defining a new scheduling policy
@section Defining a new scheduling policy

TODO

A full example showing how to define a new scheduling policy is available in
the StarPU sources in the directory @code{examples/scheduler/}.

@menu
* struct starpu_sched_policy_s::
* starpu_worker_set_sched_condition::
* starpu_sched_set_min_priority::       Set the minimum priority level
* starpu_sched_set_max_priority::       Set the maximum priority level
* starpu_push_local_task::              Assign a task to a worker
* Source code::
@end menu

@node struct starpu_sched_policy_s
@subsection @code{struct starpu_sched_policy_s} -- Scheduler methods
@table @asis
@item @emph{Description}:
This structure contains all the methods that implement a scheduling policy.  An
application may specify which scheduling strategy to use in the @code{sched_policy}
field of the @code{starpu_conf} structure passed to the @code{starpu_init}
function.
@item @emph{Fields}:
@table @asis
@item @code{init_sched}:
Initialize the scheduling policy.
@item @code{deinit_sched}:
Clean up the scheduling policy.
@item @code{push_task}:
Insert a task into the scheduler.
@item @code{push_prio_task}:
Insert a priority task into the scheduler.
@item @code{push_prio_notify}:
Notify the scheduler that a task was pushed on the worker. This method is
called when a task that was explicitly assigned to a worker is scheduled. This
method therefore makes it possible to keep the state of the scheduler coherent even
when StarPU bypasses the scheduling strategy.
@item @code{pop_task}:
Get a task from the scheduler. The mutex associated to the worker is already
taken when this method is called.
If this method is defined as @code{NULL}, the
worker will only execute tasks from its local queue. In this case, the
@code{push_task} method should use the @code{starpu_push_local_task} method to
assign tasks to the different workers.
@item @code{pop_every_task}:
Remove all available tasks from the scheduler (tasks are chained by means
of the prev and next fields of the starpu_task structure). The mutex associated
to the worker is already taken when this method is called.
@item @code{post_exec_hook} (optional):
This method is called every time a task has been executed.
@item @code{policy_name}:
Name of the policy (optional).
@item @code{policy_description}:
Description of the policy (optional).
@end table
@end table

@node starpu_worker_set_sched_condition
@subsection @code{starpu_worker_set_sched_condition} -- Specify the condition variable associated to a worker
@deftypefun void starpu_worker_set_sched_condition (int @var{workerid}, pthread_cond_t *@var{sched_cond}, pthread_mutex_t *@var{sched_mutex})
When there is no available task for a worker, StarPU blocks this worker on a
condition variable. This function specifies which condition variable (and the
associated mutex) should be used to block (and to wake up) a worker. Note that
multiple workers may use the same condition variable. For instance, in the case
of a scheduling strategy with a single task queue, the same condition variable
would be used to block and wake up all workers.
The initialization method of a scheduling strategy (@code{init_sched}) must
call this function once per worker.
@end deftypefun

@node starpu_sched_set_min_priority
@subsection @code{starpu_sched_set_min_priority}
@deftypefun void starpu_sched_set_min_priority (int @var{min_prio})
Defines the minimum priority level supported by the scheduling policy. The
default minimum priority level is the same as the default priority level which
is 0 by convention.  The application may access that value by calling the
@code{starpu_sched_get_min_priority} function.
This function should only be
called from the initialization method of the scheduling policy, and should not
be used directly from the application.
@end deftypefun

@node starpu_sched_set_max_priority
@subsection @code{starpu_sched_set_max_priority}
@deftypefun void starpu_sched_set_max_priority (int @var{max_prio})
Defines the maximum priority level supported by the scheduling policy. The
default maximum priority level is 1.  The application may access that value by
calling the @code{starpu_sched_get_max_priority} function. This function should
only be called from the initialization method of the scheduling policy, and
should not be used directly from the application.
@end deftypefun

@node starpu_push_local_task
@subsection @code{starpu_push_local_task}
@deftypefun int starpu_push_local_task (int @var{workerid}, {struct starpu_task} *@var{task}, int @var{back})
The scheduling policy may put tasks directly into a worker's local queue so
that it is not always necessary to create its own queue when the local queue
is sufficient. If "back" is not null, the task is put at the back of the queue
where the worker will pop tasks first. Setting "back" to 0 therefore ensures
a FIFO ordering.
@end deftypefun

@node Source code
@subsection Source code

@cartouche
@smallexample
static struct starpu_sched_policy_s dummy_sched_policy = @{
    .init_sched = init_dummy_sched,
    .deinit_sched = deinit_dummy_sched,
    .push_task = push_task_dummy,
    .push_prio_task = NULL,
    .pop_task = pop_task_dummy,
    .post_exec_hook = NULL,
    .pop_every_task = NULL,
    .policy_name = "dummy",
    .policy_description = "dummy scheduling strategy"
@};
@end smallexample
@end cartouche

@c ---------------------------------------------------------------------
@c Appendices
@c ---------------------------------------------------------------------

@c ---------------------------------------------------------------------
@c Full source code for the 'Scaling a Vector' example
@c ---------------------------------------------------------------------

@node Full source code for the 'Scaling a Vector' example
@appendix Full source code for the 'Scaling a Vector' example

@menu
* Main application::
* CPU Kernel::
* CUDA Kernel::
* OpenCL Kernel::
@end menu

@node Main application
@section Main application

@smallexample
@include vector_scal_c.texi
@end smallexample

@node CPU Kernel
@section CPU Kernel

@smallexample
@include vector_scal_cpu.texi
@end smallexample

@node CUDA Kernel
@section CUDA Kernel

@smallexample
@include vector_scal_cuda.texi
@end smallexample

@node OpenCL Kernel
@section OpenCL Kernel

@menu
* Invoking the kernel::
* Source of the kernel::
@end menu

@node Invoking the kernel
@subsection Invoking the kernel

@smallexample
@include vector_scal_opencl.texi
@end smallexample

@node Source of the kernel
@subsection Source of the kernel

@smallexample
@include vector_scal_opencl_codelet.texi
@end smallexample

@c
@c Indices.
@c

@node Function Index
@unnumbered Function Index
@printindex fn

@bye
 |