\input texinfo @c -*-texinfo-*-
@c %**start of header
@setfilename starpu.info
@settitle StarPU Handbook
@c %**end of header
@include version.texi
@copying
Copyright @copyright{} 2009--2011 Universit@'e de Bordeaux 1
@noindent
Copyright @copyright{} 2010, 2011 Centre National de la Recherche Scientifique
@noindent
Copyright @copyright{} 2011 Institut National de Recherche en Informatique et Automatique
This manual documents StarPU @value{VERSION}.
@quotation
Permission is granted to copy, distribute and/or modify this document
under the terms of the GNU Free Documentation License, Version 1.3
or any later version published by the Free Software Foundation;
with no Invariant Sections, no Front-Cover Texts, and no Back-Cover
Texts. A copy of the license is included in the section entitled ``GNU
Free Documentation License''.
@end quotation
@end copying
@setchapternewpage odd
@dircategory Development
@direntry
* StarPU: (starpu). StarPU Handbook
@end direntry
@titlepage
@title StarPU Handbook
@subtitle for StarPU @value{VERSION}
@page
@vskip 0pt plus 1fill
@insertcopying
@end titlepage
@c @summarycontents
@contents
@page
@node Top
@top Preface
This manual documents the usage of StarPU version @value{VERSION}. It
was last updated on @value{UPDATED}.
@comment
@comment When you add a new menu item, please keep the right hand
@comment aligned to the same column. Do not use tabs. This provides
@comment better formatting.
@comment
@menu
* Introduction::                      A basic introduction to using StarPU
* Installing StarPU::                 How to configure, build and install StarPU
* Using StarPU::                      How to run StarPU applications
* Basic Examples::                    Basic examples of the use of StarPU
* Performance optimization::          How to optimize performance with StarPU
* Performance feedback::              Performance debugging tools
* StarPU MPI support::                How to combine StarPU with MPI
* Configuring StarPU::                How to configure StarPU
* StarPU API::                        The API to use StarPU
* Advanced Topics::                   Advanced use of StarPU
* C Extensions::                      Easier StarPU programming with GCC
* Full source code for the 'Scaling a Vector' example::
* Function Index::                    Index of C functions.
* GNU Free Documentation License::    How you can copy and share this manual.
@end menu
@c ---------------------------------------------------------------------
@c Introduction to StarPU
@c ---------------------------------------------------------------------
@node Introduction
@chapter Introduction to StarPU
@menu
* Motivation::              Why StarPU?
* StarPU in a Nutshell::    The Fundamentals of StarPU
@end menu
@node Motivation
@section Motivation
@c complex machines with heterogeneous cores/devices
The use of specialized hardware such as accelerators or coprocessors offers an
interesting approach to overcome the physical limits encountered by processor
architects. As a result, many machines are now equipped with one or several
accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
effort has been devoted to offloading computation onto such accelerators, very
little attention has been paid to portability concerns on the one hand, and to the
possibility of having heterogeneous accelerators and processors interact on the other hand.
StarPU is a runtime system that offers support for heterogeneous multicore
architectures. It not only offers a unified view of the computational resources
(i.e. CPUs and accelerators at the same time), but it also takes care of
efficiently mapping and executing tasks onto a heterogeneous machine while
transparently handling low-level issues such as data transfers in a portable
fashion.
@c this leads to a complicated distributed memory design
@c which is not (easily) manageable by hand
@c added value/benefits of StarPU
@c - portability
@c - scheduling, perf. portability
@node StarPU in a Nutshell
@section StarPU in a Nutshell
@menu
* Codelet and Tasks::
* StarPU Data Management Library::
* Glossary::
* Research Papers::
@end menu
From a programming point of view, StarPU is not a new language but a library
that executes tasks explicitly submitted by the application. The data that a
task manipulates are automatically transferred onto the accelerator so that the
programmer does not have to take care of complex data movements. StarPU also
takes particular care of scheduling those tasks efficiently and allows
scheduling experts to implement custom scheduling policies in a portable
fashion.
@c explain the notion of codelet and task (i.e. g(A, B)
@node Codelet and Tasks
@subsection Codelet and Tasks
One of StarPU's primary data structures is the @b{codelet}. A codelet describes a
computational kernel that can possibly be implemented on multiple architectures
such as a CPU, a CUDA device or a Cell's SPU.
@c TODO insert illustration f : f_spu, f_cpu, ...
Another important data structure is the @b{task}. Executing a StarPU task
consists in applying a codelet on a data set, on one of the architectures on
which the codelet is implemented. A task thus describes the codelet that it
uses, but also which data are accessed, and how they are
accessed during the computation (read and/or write).
StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
operation. The task structure can also specify a @b{callback} function that is
called once StarPU has properly executed the task. It also contains optional
fields that the application may use to give hints to the scheduler (such as
priority levels).
By default, task dependencies are inferred from data dependencies (sequential
coherence) by StarPU. The application can however disable sequential coherency
for some data, and dependencies can then be expressed by hand.
A task may be identified by a unique 64-bit number chosen by the application
which we refer to as a @b{tag}.
Task dependencies can be enforced by hand either by the means of callback functions, by
submitting other tasks, or by expressing dependencies
between tags (which can thus correspond to tasks that have not been submitted
yet).
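The following sketch illustrates tag-based dependencies between two tasks (the
codelets @code{cl1} and @code{cl2} are assumed to have been defined elsewhere;
the tag values are arbitrary 64-bit numbers chosen by the application):
@cartouche
@smallexample
struct starpu_task *task1 = starpu_task_create();
task1->cl = &cl1;
task1->use_tag = 1;
task1->tag_id = 0x1;    /* @b{tag value chosen by the application} */
struct starpu_task *task2 = starpu_task_create();
task2->cl = &cl2;
task2->use_tag = 1;
task2->tag_id = 0x2;
/* @b{task2 (tag 0x2) will only start once task1 (tag 0x1) has completed} */
starpu_tag_declare_deps((starpu_tag_t)0x2, 1, (starpu_tag_t)0x1);
starpu_task_submit(task1);
starpu_task_submit(task2);
@end smallexample
@end cartouche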
@c TODO insert illustration f(Ar, Brw, Cr) + ..
@c DSM
@node StarPU Data Management Library
@subsection StarPU Data Management Library
Because StarPU schedules tasks at runtime, data transfers have to be
done automatically and ``just-in-time'' between processing units,
relieving the application programmer from explicit data transfers.
Moreover, to avoid unnecessary transfers, StarPU keeps data
where it was last needed, even if it was modified there, and it
allows multiple copies of the same data to reside at the same time on
several processing units as long as it is not modified.
@node Glossary
@subsection Glossary
A @b{codelet} records pointers to various implementations of the same
theoretical function.
A @b{memory node} can be either the main RAM or GPU-embedded memory.
A @b{bus} is a link between memory nodes.
A @b{data handle} keeps track of replicates of the same data (@b{registered} by the
application) over various memory nodes. The data management library manages
keeping them coherent.
The @b{home} memory node of a data handle is the memory node from which the data
was registered (usually the main memory node).
A @b{task} represents a scheduled execution of a codelet on some data handles.
A @b{tag} is a rendez-vous point. Tasks typically have their own tag, and can
depend on other tags. The value is chosen by the application.
A @b{worker} executes tasks. There is typically one per CPU computation core and
one per accelerator (for which a whole CPU core is dedicated).
A @b{driver} drives a given kind of worker. There are currently CPU, CUDA,
OpenCL and Gordon drivers. They usually start several workers to actually drive
them.
A @b{performance model} is a (dynamic or static) model of the performance of a
given codelet. Codelets can have an execution time performance model as well as
a power consumption performance model.
A data @b{interface} describes the layout of the data: for a vector, a pointer
to the start, the number of elements and the size of each element; for a matrix, a
pointer to the start, the number of elements per row, the offset between rows,
and the size of each element; etc. To access their data, codelet functions are
given interfaces for the local memory node replicates of the data handles of the
scheduled task.
@b{Partitioning} data means dividing the data of a given data handle (called
@b{father}) into a series of @b{children} data handles which designate various
portions of the former.
A @b{filter} is the function which computes children data handles from a father
data handle, and thus describes how the partitioning should be done (horizontal,
vertical, etc.).
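As an illustrative sketch only (the filter function name and the fields of
@code{struct starpu_data_filter} used here are assumptions; see the Partitioning
Data section for the actual interface), partitioning a registered vector into two
equal children could look as follows:
@cartouche
@smallexample
/* @b{Assumed sketch: split the vector designated by vector_handle in two} */
struct starpu_data_filter f = @{
    .filter_func = starpu_block_filter_func_vector,
    .nchildren = 2
@};
starpu_data_partition(vector_handle, &f);
/* @b{children can then be used as regular data handles} */
starpu_data_handle sub_handle = starpu_data_get_sub_data(vector_handle, 1, 0);
/* @b{gather the pieces back into the father handle when done} */
starpu_data_unpartition(vector_handle, 0);
@end smallexample
@end cartouche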
@b{Acquiring} a data handle can be done from the main application, to safely
access the data of a data handle from its home node, without having to
unregister it.
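For instance, a minimal sketch of reading back the up-to-date content of a
registered vector from the main application (assuming it was registered as
@code{vector_handle}, as in the examples later in this manual):
@cartouche
@smallexample
/* @b{wait until pending tasks have released the data and it is valid in main memory} */
starpu_data_acquire(vector_handle, STARPU_R);
printf("First element is %f\n", vector[0]);
/* @b{let StarPU (and further tasks) use the data again} */
starpu_data_release(vector_handle);
@end smallexample
@end cartouche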
@node Research Papers
@subsection Research Papers
Research papers about StarPU can be found at
@indicateurl{http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html}.
Notably, a good overview is given in the research report
@indicateurl{http://hal.archives-ouvertes.fr/inria-00467677}.
@c ---------------------------------------------------------------------
@c Installing StarPU
@c ---------------------------------------------------------------------
@node Installing StarPU
@chapter Installing StarPU
@menu
* Downloading StarPU::
* Configuration of StarPU::
* Building and Installing StarPU::
@end menu
StarPU can be built and installed by the standard means of the GNU
autotools. The following chapter briefly explains how these tools
can be used to install StarPU.
@node Downloading StarPU
@section Downloading StarPU
@menu
* Getting Sources::
* Optional dependencies::
@end menu
@node Getting Sources
@subsection Getting Sources
The simplest way to get the StarPU sources is to download the latest official
release tarball from @indicateurl{https://gforge.inria.fr/frs/?group_id=1570},
or the latest nightly snapshot from
@indicateurl{http://starpu.gforge.inria.fr/testing/}. The following documents
how to get the very latest version from the Subversion repository itself; this
should only be needed if you require the very latest changes (i.e. less than a
day old).
The source code is managed by a Subversion server hosted by the
InriaGforge. To get the source code, you need:
@itemize
@item
To install the Subversion client if it is
not already available on your system. The software can be obtained from
@indicateurl{http://subversion.tigris.org}. If you are running
on Windows, you will probably prefer to use TortoiseSVN from
@indicateurl{http://tortoisesvn.tigris.org/}.
@item
You can check out the project's SVN repository through anonymous
access. This will provide you with read access to the
repository.
If you need write access to the StarPU project, you can also choose to
become a member of the project @code{starpu}. For this, you first need to get
an account on the gForge server. You can then send a request to join the project
(@indicateurl{https://gforge.inria.fr/project/request.php?group_id=1570}).
@item
More information on how to get a gForge account, to become a member of
a project, or on any other related task can be obtained from the
InriaGforge at @indicateurl{https://gforge.inria.fr/}. The most important
thing is to upload your public SSH key on the gForge server (see the
FAQ at @indicateurl{http://siteadmin.gforge.inria.fr/FAQ.html#Q6} for
instructions).
@end itemize
You can now check out the latest version from the Subversion server:
@itemize
@item
using the anonymous access via svn:
@example
% svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk
@end example
@item
using the anonymous access via https:
@example
% svn checkout --username anonsvn https://scm.gforge.inria.fr/svn/starpu/trunk
@end example
The password is @code{anonsvn}.
@item
using your gForge account:
@example
% svn checkout svn+ssh://<login>@@scm.gforge.inria.fr/svn/starpu/trunk
@end example
@end itemize
The following step requires the availability of @code{autoconf} and
@code{automake} to generate the @code{./configure} script. This is
done by calling @code{./autogen.sh}. The required version for
@code{autoconf} is 2.60 or higher. You will also need @code{makeinfo}.
@example
% ./autogen.sh
@end example
If the autotools are not available on your machine or not recent
enough, you can choose to download the latest nightly tarball, which
is provided with a @code{configure} script.
@example
% wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
@end example
@node Optional dependencies
@subsection Optional dependencies
The topology discovery library @code{hwloc} is not mandatory to use StarPU
but is strongly recommended. It allows StarPU to increase performance and to
perform some topology-aware scheduling.
@code{hwloc} is available in major distributions and for most OSes and can be
downloaded from @indicateurl{http://www.open-mpi.org/software/hwloc}.
@node Configuration of StarPU
@section Configuration of StarPU
@menu
* Generating Makefiles and configuration scripts::
* Running the configuration::
@end menu
@node Generating Makefiles and configuration scripts
@subsection Generating Makefiles and configuration scripts
This step is not necessary when using the tarball releases of StarPU. If you
are using the source code from the svn repository, you first need to generate
the configure scripts and the Makefiles.
@example
% ./autogen.sh
@end example
@node Running the configuration
@subsection Running the configuration
@example
% ./configure
@end example
Details about options that are useful to give to @code{./configure} are given in
@ref{Compilation configuration}.
@node Building and Installing StarPU
@section Building and Installing StarPU
@menu
* Building::
* Sanity Checks::
* Installing::
@end menu
@node Building
@subsection Building
@example
% make
@end example
@node Sanity Checks
@subsection Sanity Checks
In order to make sure that StarPU is working properly on the system, it is also
possible to run a test suite.
@example
% make check
@end example
@node Installing
@subsection Installing
In order to install StarPU at the location that was specified during
configuration:
@example
% make install
@end example
@c ---------------------------------------------------------------------
@c Using StarPU
@c ---------------------------------------------------------------------
@node Using StarPU
@chapter Using StarPU
@menu
* Setting flags for compiling and linking applications::
* Running a basic StarPU application::
* Kernel threads started by StarPU::
* Enabling OpenCL::
@end menu
@node Setting flags for compiling and linking applications
@section Setting flags for compiling and linking applications
Compiling and linking an application against StarPU may require specific
flags or libraries (for instance @code{CUDA} or @code{libspe2}).
To this end, it is possible to use the @code{pkg-config} tool.
If StarPU was not installed at some standard location, the path of StarPU's
library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
that @code{pkg-config} can find it. For example, if StarPU was installed in
@code{$prefix_dir}:
@example
% PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
@end example
The flags required to compile or link against StarPU are then
accessible with the following commands:
@example
% pkg-config --cflags libstarpu  # options for the compiler
% pkg-config --libs libstarpu    # options for the linker
@end example
@node Running a basic StarPU application
@section Running a basic StarPU application
Basic examples using StarPU are built in the directory
@code{examples/basic_examples/} (and installed in
@code{$prefix_dir/lib/starpu/examples/}). You can for example run the example
@code{vector_scal}.
@example
% ./examples/basic_examples/vector_scal
BEFORE : First element was 1.000000
AFTER First element is 3.140000
%
@end example
When StarPU is used for the first time, the directory
@code{$HOME/.starpu/} is created; performance models will be stored in
that directory.
Please note that buses are benchmarked when StarPU is launched for the
first time. This may take a few minutes, or less if @code{hwloc} is
installed. This step is done only once per user and per machine.
@node Kernel threads started by StarPU
@section Kernel threads started by StarPU
StarPU automatically binds one thread per CPU core. It does not use
SMT/hyperthreading because kernels are usually already optimized for using a
full core, and using hyperthreading would make kernel calibration rather random.
Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU.
While StarPU tasks are executing, the application is not supposed to do
computations in the threads it starts itself; tasks should be used instead.
TODO: add a StarPU function to bind an application thread (e.g. the main thread)
to a dedicated core (and thus disable the corresponding StarPU CPU worker).
@node Enabling OpenCL
@section Enabling OpenCL
When both CUDA and OpenCL drivers are enabled, StarPU will launch an
OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
This design choice was necessary as OpenCL and CUDA cannot run at the
same time on the same NVIDIA GPU, as there is currently no interoperability
between them.
To enable OpenCL, you need either to disable CUDA when configuring StarPU:
@example
% ./configure --disable-cuda
@end example
or when running applications:
@example
% STARPU_NCUDA=0 ./application
@end example
OpenCL will automatically be started on any device not yet used by
CUDA. On a machine with 4 GPUs, it is therefore possible to
enable CUDA on 2 devices and OpenCL on the other 2 devices by doing
so:
@example
% STARPU_NCUDA=2 ./application
@end example
@c ---------------------------------------------------------------------
@c Basic Examples
@c ---------------------------------------------------------------------
@node Basic Examples
@chapter Basic Examples
@menu
* Compiling and linking options::
* Hello World::                   Submitting Tasks
* Scaling a Vector::              Manipulating Data
* Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
* Using multiple implementations of a codelet::
* Task and Worker Profiling::
* Partitioning Data::             Partitioning Data
* Performance model example::
* Theoretical lower bound on execution time::
* Insert Task Utility::
* More examples::                 More examples shipped with StarPU
* Debugging::                     When things go wrong.
@end menu
@node Compiling and linking options
@section Compiling and linking options
Let's suppose StarPU has been installed in the directory
@code{$STARPU_DIR}. As explained in @ref{Setting flags for compiling and linking applications},
the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
libraries at runtime.
@example
% PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
% LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
@end example
The Makefile could for instance contain the following lines to define which
options must be given to the compiler and to the linker:
@cartouche
@example
CFLAGS += $$(pkg-config --cflags libstarpu)
LDFLAGS += $$(pkg-config --libs libstarpu)
@end example
@end cartouche
@node Hello World
@section Hello World
@menu
* Required Headers::
* Defining a Codelet::
* Submitting a Task::
* Execution of Hello World::
@end menu
In this section, we show how to implement a simple program that submits a task to StarPU.
@node Required Headers
@subsection Required Headers
The @code{starpu.h} header should be included in any code using StarPU.
@cartouche
@smallexample
#include <starpu.h>
@end smallexample
@end cartouche
@node Defining a Codelet
@subsection Defining a Codelet
@cartouche
@smallexample
struct params @{
    int i;
    float f;
@};
void cpu_func(void *buffers[], void *cl_arg)
@{
    struct params *params = cl_arg;
    printf("Hello world (params = @{%i, %f@} )\n", params->i, params->f);
@}
starpu_codelet cl =
@{
    .where = STARPU_CPU,
    .cpu_func = cpu_func,
    .nbuffers = 0
@};
@end smallexample
@end cartouche
A codelet is a structure that represents a computational kernel. Such a codelet
may contain an implementation of the same kernel on different architectures
(e.g. CUDA, Cell's SPU, x86, ...).
The @code{nbuffers} field specifies the number of data buffers that are
manipulated by the codelet: here the codelet does not access or modify any data
that is controlled by our data management library. Note that the argument
passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
structure) does not count as a buffer since it is not managed by our data
management library, but just contains trivial parameters.
@c TODO need a crossref to the proper description of "where" see bla for more ...
We create a codelet which may only be executed on the CPUs. The @code{where}
field is a bitmask that defines where the codelet may be executed. Here, the
@code{STARPU_CPU} value means that only CPUs can execute this codelet
(@pxref{Codelets and Tasks} for more details on this field).
When a CPU core executes a codelet, it calls the @code{cpu_func} function,
which @emph{must} have the following prototype:
@code{void (*cpu_func)(void *buffers[], void *cl_arg);}
In this example, we can ignore the first argument of this function which gives a
description of the input and output buffers (e.g. the size and the location of
the matrices) since there is none.
The second argument is a pointer to a buffer passed as an
argument to the codelet by the means of the @code{cl_arg} field of the
@code{starpu_task} structure.
@c TODO rewrite so that it is a little clearer ?
Be aware that this may be a pointer to a
@emph{copy} of the actual buffer, and not the pointer given by the programmer:
if the codelet modifies this buffer, there is no guarantee that the initial
buffer will be modified as well. This for instance implies that the buffer
cannot be used as a synchronization medium. If synchronization is needed, data
has to be registered to StarPU, see @ref{Scaling a Vector}.
@node Submitting a Task
@subsection Submitting a Task
@cartouche
@smallexample
void callback_func(void *callback_arg)
@{
    printf("Callback function (arg %x)\n", callback_arg);
@}
int main(int argc, char **argv)
@{
    /* @b{initialize StarPU} */
    starpu_init(NULL);
    struct starpu_task *task = starpu_task_create();
    task->cl = &cl; /* @b{Pointer to the codelet defined above} */
    struct params params = @{ 1, 2.0f @};
    task->cl_arg = &params;
    task->cl_arg_size = sizeof(params);
    task->callback_func = callback_func;
    task->callback_arg = (void *)0x42;
    /* @b{starpu_task_submit will be a blocking call} */
    task->synchronous = 1;
    /* @b{submit the task to StarPU} */
    starpu_task_submit(task);
    /* @b{terminate StarPU} */
    starpu_shutdown();
    return 0;
@}
@end smallexample
@end cartouche
Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
@code{NULL} argument specifies that we use the default configuration. Tasks cannot
be submitted after the termination of StarPU by a call to
@code{starpu_shutdown}.
In the example above, a task structure is allocated by a call to
@code{starpu_task_create}. This function only allocates and fills the
corresponding structure with the default settings (@pxref{Codelets and
Tasks, starpu_task_create}), but it does not submit the task to StarPU.
@c not really clear ;)
The @code{cl} field is a pointer to the codelet which the task will
execute: in other words, the codelet structure describes which computational
kernel should be offloaded on the different architectures, and the task
structure is a wrapper containing a codelet and the piece of data on which the
codelet should operate.
The optional @code{cl_arg} field is a pointer to a buffer (of size
@code{cl_arg_size}) with some parameters for the kernel
described by the codelet. For instance, if a codelet implements a computational
kernel that multiplies its input vector by a constant, the constant could be
specified by the means of this buffer, instead of registering it as a StarPU
data. It must however be noted that StarPU avoids making copies whenever possible
and rather passes the pointer as such, so the buffer which is pointed to must be
kept allocated until the task terminates, and if several tasks are submitted
with various parameters, each of them must be given a pointer to its own
buffer.
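For instance, a sketch of submitting several tasks, each with its own parameter
buffer (a simple illustration, not part of the Hello World example; the buffers
are leaked here for brevity, a real application would free them once the tasks
have terminated, e.g. from the callback):
@cartouche
@smallexample
unsigned i, ntasks = 4;
for (i = 0; i < ntasks; i++)
@{
    /* @b{each task gets its own buffer, kept allocated until the task terminates} */
    struct params *p = malloc(sizeof(*p));
    p->i = i;
    p->f = 2.0f * i;
    struct starpu_task *task = starpu_task_create();
    task->cl = &cl;
    task->cl_arg = p;
    task->cl_arg_size = sizeof(*p);
    starpu_task_submit(task);
@}
@end smallexample
@end cartouche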
Once a task has been executed, an optional callback function is called.
While the computational kernel could be offloaded on various architectures, the
callback function is always executed on a CPU. The @code{callback_arg}
pointer is passed as an argument of the callback. The prototype of a callback
function must be:
@code{void (*callback_function)(void *);}
If the @code{synchronous} field is non-zero, task submission will be
synchronous: the @code{starpu_task_submit} function will not return until the
task has been executed. Note that the @code{starpu_shutdown} method does not
guarantee that asynchronous tasks have been executed before it returns;
@code{starpu_task_wait_for_all} can be used to that effect, or data can be
unregistered (@code{starpu_data_unregister(vector_handle);}), which will
implicitly wait for all the tasks scheduled to work on it, unless explicitly
disabled thanks to @code{starpu_data_set_default_sequential_consistency_flag} or
@code{starpu_data_set_sequential_consistency_flag}.
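For instance, a minimal sketch of submitting the task asynchronously and making
sure it has completed before shutting StarPU down:
@cartouche
@smallexample
task->synchronous = 0;          /* @b{submission returns immediately} */
starpu_task_submit(task);
/* @b{... submit further tasks, do other work ...} */
starpu_task_wait_for_all();     /* @b{block until all submitted tasks have terminated} */
starpu_shutdown();
@end smallexample
@end cartouche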
@node Execution of Hello World
@subsection Execution of Hello World
@smallexample
% make hello_world
cc $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu) hello_world.c -o hello_world
% ./hello_world
Hello world (params = @{1, 2.000000@} )
Callback function (arg 42)
@end smallexample
@node Scaling a Vector
@section Manipulating Data: Scaling a Vector
The previous example has shown how to submit tasks. In this section,
we show how StarPU tasks can manipulate data. The full source code for
this example is given in @ref{Full source code for the 'Scaling a Vector' example}.
@menu
* Source code of Vector Scaling::
* Execution of Vector Scaling::
@end menu
@node Source code of Vector Scaling
@subsection Source code of Vector Scaling
Programmers can describe the data layout of their application so that StarPU is
responsible for enforcing data coherency and availability across the machine.
Instead of handling complex (and non-portable) mechanisms to perform data
movements, programmers only declare which piece of data is accessed and/or
modified by a task, and StarPU makes sure that when a computational kernel
starts somewhere (e.g. on a GPU), its data are available locally.
Before submitting those tasks, the programmer first needs to declare the
different pieces of data to StarPU using the @code{starpu_*_data_register}
functions. To ease the development of applications for StarPU, it is possible
to describe multiple types of data layout. A type of data layout is called an
@b{interface}. There are different predefined interfaces available in StarPU:
here we will consider the @b{vector interface}.
The following lines show how to declare an array of @code{NX} elements of type
@code{float} using the vector interface:
@cartouche
@smallexample
float vector[NX];
starpu_data_handle vector_handle;
starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
                            sizeof(vector[0]));
@end smallexample
@end cartouche
The first argument, called the @b{data handle}, is an opaque pointer which
designates the array in StarPU. This is also the structure which is used to
describe which data is used by a task. The second argument is the node number
where the data originally resides. Here it is 0 since the @code{vector} array is in
the main memory. Then comes the pointer @code{vector} where the data can be found in main memory,
the number of elements in the vector and the size of each element.
The following shows how to construct a StarPU task that will manipulate the
vector and a constant factor.
@cartouche
@smallexample
float factor = 3.14;
struct starpu_task *task = starpu_task_create();
task->cl = &cl;                          /* @b{Pointer to the codelet defined below} */
task->buffers[0].handle = vector_handle; /* @b{First parameter of the codelet} */
task->buffers[0].mode = STARPU_RW;
task->cl_arg = &factor;
task->cl_arg_size = sizeof(factor);
task->synchronous = 1;
starpu_task_submit(task);
@end smallexample
@end cartouche
Since the factor is a mere constant float value,
it does not need a preliminary registration, and
can just be passed through the @code{cl_arg} pointer like in the previous
example. The vector parameter is described by its handle.
There are two fields in each element of the @code{buffers} array.
@code{handle} is the handle of the data, and @code{mode} specifies how the
kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
write-only and @code{STARPU_RW} for read and write access).
The definition of the codelet can be written as follows:
@cartouche
@smallexample
void scal_cpu_func(void *buffers[], void *cl_arg)
@{
    unsigned i;
    float *factor = cl_arg;
    /* length of the vector */
    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
    /* CPU copy of the vector pointer */
    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}
starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_func = scal_cpu_func,
    .nbuffers = 1
@};
@end smallexample
@end cartouche
The first argument is an array that gives
a description of all the buffers passed in the @code{task->buffers} array. The
size of this array is given by the @code{nbuffers} field of the codelet
structure. For the sake of genericity, this array contains pointers to the
different interfaces describing each buffer. In the case of the @b{vector
interface}, the location of the vector (resp. its length) is accessible in the
@code{ptr} (resp. @code{nx}) field of this interface. Since the vector is accessed in a
read-write fashion, any modification will automatically affect future accesses
to this vector made by other tasks.
The second argument of the @code{scal_cpu_func} function contains a pointer to the
parameters of the codelet (given in @code{task->cl_arg}), so that we read the
constant factor from this pointer.
@node Execution of Vector Scaling
@subsection Execution of Vector Scaling
@smallexample
% make vector_scal
cc $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu) vector_scal.c -o vector_scal
% ./vector_scal
0.000000 3.000000 6.000000 9.000000 12.000000
@end smallexample
@node Vector Scaling on an Hybrid CPU/GPU Machine
@section Vector Scaling on an Hybrid CPU/GPU Machine
Contrary to the previous examples, the task submitted in this example may not
only be executed by the CPUs, but also by a CUDA device.
@menu
* Definition of the CUDA Kernel::
* Definition of the OpenCL Kernel::
* Definition of the Main Code::
* Execution of Hybrid Vector Scaling::
@end menu
@node Definition of the CUDA Kernel
@subsection Definition of the CUDA Kernel
The CUDA implementation can be written as follows. It needs to be compiled with
a CUDA compiler such as nvcc, the NVIDIA CUDA compiler driver. It must be noted
that the vector pointer returned by @code{STARPU_VECTOR_GET_PTR} is here a pointer in GPU
memory, so that it can be passed as such to the @code{vector_mult_cuda} kernel
call.
@cartouche
@smallexample
#include <starpu.h>
#include <starpu_cuda.h>
static __global__ void vector_mult_cuda(float *val, unsigned n,
                                        float factor)
@{
    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        val[i] *= factor;
@}
extern "C" void scal_cuda_func(void *buffers[], void *_args)
@{
    float *factor = (float *)_args;
    /* length of the vector */
    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
    /* CUDA copy of the vector pointer */
    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned threads_per_block = 64;
    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}
@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
@}
@end smallexample
@end cartouche
  734. @node Definition of the OpenCL Kernel
  735. @subsection Definition of the OpenCL Kernel
  736. The OpenCL implementation can be written as follows. StarPU provides
  737. tools to compile a OpenCL kernel stored in a file.
  738. @cartouche
  739. @smallexample
  740. __kernel void vector_mult_opencl(__global float* val, int nx, float factor)
  741. @{
  742. const int i = get_global_id(0);
  743. if (i < nx) @{
  744. val[i] *= factor;
  745. @}
  746. @}
  747. @end smallexample
  748. @end cartouche
  749. Similarly to CUDA, the pointer returned by @code{STARPU_VECTOR_GET_PTR} is here
  750. a device pointer, so that it is passed as such to the OpenCL kernel.
  751. @cartouche
  752. @smallexample
#include <starpu.h>
@i{#include <starpu_opencl.h>}

@i{extern struct starpu_opencl_program programs;}

void scal_opencl_func(void *buffers[], void *_args)
@{
    float *factor = _args;
@i{    int id, devid, err;}
@i{    cl_kernel kernel;}
@i{    cl_command_queue queue;}
@i{    cl_event event;}

    /* length of the vector */
    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
    /* OpenCL copy of the vector pointer */
    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);

@i{    id = starpu_worker_get_id();}
@i{    devid = starpu_worker_get_devid(id);}

@i{    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,}
@i{                    "vector_mult_opencl", devid);   /* @b{Name of the codelet defined above} */}
@i{    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}

@i{    err = clSetKernelArg(kernel, 0, sizeof(val), &val);}
@i{    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);}
@i{    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);}
@i{    if (err) STARPU_OPENCL_REPORT_ERROR(err);}

@i{    @{}
@i{        size_t global=1;}
@i{        size_t local=1;}
@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);}
@i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
@i{    @}}

@i{    clFinish(queue);}
@i{    starpu_opencl_collect_stats(event);}
@i{    clReleaseEvent(event);}

@i{    starpu_opencl_release_kernel(kernel);}
@}
  787. @end smallexample
  788. @end cartouche
  789. @node Definition of the Main Code
  790. @subsection Definition of the Main Code
  791. The CPU implementation is the same as in the previous section.
  792. Here is the source of the main application. You can notice the value of the
  793. field @code{where} for the codelet. We specify
  794. @code{STARPU_CPU|STARPU_CUDA|STARPU_OPENCL} to indicate to StarPU that the codelet
can be executed on a CPU, a CUDA device, or an OpenCL device.
  796. @cartouche
  797. @smallexample
  798. #include <starpu.h>
  799. #define NX 2048
  800. extern void scal_cuda_func(void *buffers[], void *_args);
  801. extern void scal_cpu_func(void *buffers[], void *_args);
  802. extern void scal_opencl_func(void *buffers[], void *_args);
  803. /* @b{Definition of the codelet} */
static starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL, /* @b{It can be executed on a CPU,} */
                                                   /* @b{on a CUDA device, or on an OpenCL device} */
    .cuda_func = scal_cuda_func,
    .cpu_func = scal_cpu_func,
    .opencl_func = scal_opencl_func,
    .nbuffers = 1
@};
  812. #ifdef STARPU_USE_OPENCL
  813. /* @b{The compiled version of the OpenCL program} */
  814. struct starpu_opencl_program programs;
  815. #endif
  816. int main(int argc, char **argv)
  817. @{
  818. float *vector;
  819. int i, ret;
  820. float factor=3.0;
  821. struct starpu_task *task;
  822. starpu_data_handle vector_handle;
  823. starpu_init(NULL); /* @b{Initialising StarPU} */
  824. #ifdef STARPU_USE_OPENCL
  825. starpu_opencl_load_opencl_from_file(
  826. "examples/basic_examples/vector_scal_opencl_codelet.cl",
  827. &programs, NULL);
  828. #endif
  829. vector = malloc(NX*sizeof(vector[0]));
  830. assert(vector);
  831. for(i=0 ; i<NX ; i++) vector[i] = i;
  832. @end smallexample
  833. @end cartouche
  834. @cartouche
  835. @smallexample
  836. /* @b{Registering data within StarPU} */
  837. starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
  838. NX, sizeof(vector[0]));
  839. /* @b{Definition of the task} */
  840. task = starpu_task_create();
  841. task->cl = &cl;
  842. task->buffers[0].handle = vector_handle;
  843. task->buffers[0].mode = STARPU_RW;
  844. task->cl_arg = &factor;
  845. task->cl_arg_size = sizeof(factor);
  846. @end smallexample
  847. @end cartouche
  848. @cartouche
  849. @smallexample
  850. /* @b{Submitting the task} */
  851. ret = starpu_task_submit(task);
  852. if (ret == -ENODEV) @{
  853. fprintf(stderr, "No worker may execute this task\n");
  854. return 1;
  855. @}
  856. @c TODO: Mmm, should rather be an unregistration with an implicit dependency, no?
  857. /* @b{Waiting for its termination} */
  858. starpu_task_wait_for_all();
  859. /* @b{Update the vector in RAM} */
  860. starpu_data_acquire(vector_handle, STARPU_R);
  861. @end smallexample
  862. @end cartouche
  863. @cartouche
  864. @smallexample
  865. /* @b{Access the data} */
  866. for(i=0 ; i<NX; i++) @{
  867. fprintf(stderr, "%f ", vector[i]);
  868. @}
  869. fprintf(stderr, "\n");
  870. /* @b{Release the RAM view of the data before unregistering it and shutting down StarPU} */
  871. starpu_data_release(vector_handle);
  872. starpu_data_unregister(vector_handle);
  873. starpu_shutdown();
  874. return 0;
  875. @}
  876. @end smallexample
  877. @end cartouche
  878. @node Execution of Hybrid Vector Scaling
  879. @subsection Execution of Hybrid Vector Scaling
  880. The Makefile given at the beginning of the section must be extended to
  881. give the rules to compile the CUDA source code. Note that the source
file of the OpenCL kernel does not need to be compiled now; it will
  883. be compiled at run-time when calling the function
  884. @code{starpu_opencl_load_opencl_from_file()} (@pxref{starpu_opencl_load_opencl_from_file}).
  885. @cartouche
  886. @smallexample
CFLAGS  += $(shell pkg-config --cflags libstarpu)
LDFLAGS += $(shell pkg-config --libs libstarpu)
CC       = gcc

vector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o

%.o: %.cu
        nvcc $(CFLAGS) $< -c -o $@

clean:
        rm -f vector_scal *.o
  895. @end smallexample
  896. @end cartouche
  897. @smallexample
  898. % make
  899. @end smallexample
  900. and to execute it, with the default configuration:
  901. @smallexample
  902. % ./vector_scal
  903. 0.000000 3.000000 6.000000 9.000000 12.000000
  904. @end smallexample
  905. or for example, by disabling CPU devices:
  906. @smallexample
  907. % STARPU_NCPUS=0 ./vector_scal
  908. 0.000000 3.000000 6.000000 9.000000 12.000000
  909. @end smallexample
or by disabling CUDA devices (which may allow OpenCL devices to be used instead,
see @ref{Enabling OpenCL}):
  912. @smallexample
  913. % STARPU_NCUDA=0 ./vector_scal
  914. 0.000000 3.000000 6.000000 9.000000 12.000000
  915. @end smallexample
  916. @node Using multiple implementations of a codelet
  917. @section Using multiple implementations of a codelet
  918. One may want to write multiple implementations of a codelet for a single type of
  919. device and let StarPU choose which one to run. As an example, we will show how
to use SSE to scale a vector. The codelet can be written as follows:
  921. @cartouche
  922. @smallexample
#include <xmmintrin.h>

void scal_sse_func(void *buffers[], void *cl_arg)
@{
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    unsigned int n_iterations = n/4;
    if (n % 4 != 0)
        n_iterations++;

    __m128 *VECTOR = (__m128*) vector;
    __m128 factor __attribute__((aligned(16)));
    factor = _mm_set1_ps(*(float *) cl_arg);

    unsigned int i;
    for (i = 0; i < n_iterations; i++)
        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
@}
  938. @end smallexample
  939. @end cartouche
  940. The @code{cpu_func} field of the @code{starpu_codelet} structure has to be set
  941. to the special value @code{STARPU_MULTIPLE_CPU_IMPLEMENTATIONS}. Note that
  942. @code{STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS} and
  943. @code{STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS} are also available.
  944. @cartouche
  945. @smallexample
  946. starpu_codelet cl = @{
  947. .where = STARPU_CPU,
  948. .cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
  949. .cpu_funcs = @{ scal_cpu_func, scal_sse_func @},
  950. .nbuffers = 1
  951. @};
  952. @end smallexample
  953. @end cartouche
  954. The scheduler will measure the performance of all the implementations it was
  955. given, and pick the one that seems to be the fastest.
  956. @node Task and Worker Profiling
  957. @section Task and Worker Profiling
  958. A full example showing how to use the profiling API is available in
  959. the StarPU sources in the directory @code{examples/profiling/}.
  960. @cartouche
  961. @smallexample
  962. struct starpu_task *task = starpu_task_create();
  963. task->cl = &cl;
  964. task->synchronous = 1;
  965. /* We will destroy the task structure by hand so that we can
  966. * query the profiling info before the task is destroyed. */
  967. task->destroy = 0;
  968. /* Submit and wait for completion (since synchronous was set to 1) */
  969. starpu_task_submit(task);
  970. /* The task is finished, get profiling information */
  971. struct starpu_task_profiling_info *info = task->profiling_info;
/* How much time did it take before the task started? */
double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
/* How long was the task execution? */
double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
  976. /* We don't need the task structure anymore */
  977. starpu_task_destroy(task);
  978. @end smallexample
  979. @end cartouche
  980. @cartouche
  981. @smallexample
  982. /* Display the occupancy of all workers during the test */
  983. int worker;
  984. for (worker = 0; worker < starpu_worker_get_count(); worker++)
  985. @{
  986. struct starpu_worker_profiling_info worker_info;
  987. int ret = starpu_worker_get_profiling_info(worker, &worker_info);
  988. STARPU_ASSERT(!ret);
  989. double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
  990. double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
  991. double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
  992. float executing_ratio = 100.0*executing_time/total_time;
  993. float sleeping_ratio = 100.0*sleeping_time/total_time;
  994. char workername[128];
  995. starpu_worker_get_name(worker, workername, 128);
  996. fprintf(stderr, "Worker %s:\n", workername);
  997. fprintf(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
  998. fprintf(stderr, "\texec time : %.2lf ms (%.2f %%)\n", executing_time*1e-3,
  999. executing_ratio);
  1000. fprintf(stderr, "\tblocked time : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
  1001. sleeping_ratio);
  1002. @}
  1003. @end smallexample
  1004. @end cartouche
  1005. @node Partitioning Data
  1006. @section Partitioning Data
An existing piece of data can be partitioned into sub-parts to be used by different tasks, for instance:
  1008. @cartouche
  1009. @smallexample
  1010. int vector[NX];
  1011. starpu_data_handle handle;
  1012. /* Declare data to StarPU */
  1013. starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
  1014. /* Partition the vector in PARTS sub-vectors */
  1015. starpu_filter f =
  1016. @{
  1017. .filter_func = starpu_block_filter_func_vector,
  1018. .nchildren = PARTS
  1019. @};
  1020. starpu_data_partition(handle, &f);
  1021. @end smallexample
  1022. @end cartouche
  1023. @cartouche
  1024. @smallexample
  1025. /* Submit a task on each sub-vector */
  1026. for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
  1027. /* Get subdata number i (there is only 1 dimension) */
  1028. starpu_data_handle sub_handle = starpu_data_get_sub_data(handle, 1, i);
  1029. struct starpu_task *task = starpu_task_create();
  1030. task->buffers[0].handle = sub_handle;
  1031. task->buffers[0].mode = STARPU_RW;
  1032. task->cl = &cl;
  1033. task->synchronous = 1;
  1034. task->cl_arg = &factor;
  1035. task->cl_arg_size = sizeof(factor);
  1036. starpu_task_submit(task);
  1037. @}
  1038. @end smallexample
  1039. @end cartouche
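Once all the tasks working on the sub-vectors have completed, the pieces can be
gathered back into the original handle. A minimal sketch, assuming that
@code{starpu_data_unpartition} takes the handle and the memory node on which to
gather the data (here node 0, the main memory):
@cartouche
@smallexample
    /* @b{Gather the sub-vectors back into the original handle, in main memory} */
    starpu_data_unpartition(handle, 0);
    /* @b{The whole vector can then be unregistered as usual} */
    starpu_data_unregister(handle);
@end smallexample
@end cartouche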
  1040. Partitioning can be applied several times, see
  1041. @code{examples/basic_examples/mult.c} and @code{examples/filters/}.
  1042. @node Performance model example
  1043. @section Performance model example
  1044. To achieve good scheduling, StarPU scheduling policies need to be able to
  1045. estimate in advance the duration of a task. This is done by giving to codelets
  1046. a performance model, by defining a @code{starpu_perfmodel_t} structure and
  1047. providing its address in the @code{model} field of the @code{starpu_codelet}
structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel_t}
are mandatory: they give a name to the model and specify its type, since
there are several kinds of performance models.
  1051. @itemize
  1052. @item
  1053. Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
  1054. given set of data input/output sizes, the performance will always be about the
  1055. same. This is very true for regular kernels on GPUs for instance (<0.1% error),
  1056. and just a bit less true on CPUs (~=1% error). This also assumes that there are
few different sets of data input/output sizes. StarPU will then keep a record of
the average time of previous executions on the various processing units, and use
it as an estimation. History is done per task size, by using a hash of the input
and output sizes as an index.
  1061. It will also save it in @code{~/.starpu/sampling/codelets}
  1062. for further executions, and can be observed by using the
  1063. @code{starpu_perfmodel_display} command, or drawn by using
the @code{starpu_perfmodel_plot} command. The models are indexed by machine name. To
  1065. share the models between machines (e.g. for a homogeneous cluster), use
  1066. @code{export STARPU_HOSTNAME=some_global_name}. The following is a small code
  1067. example.
  1068. If e.g. the code is recompiled with other compilation options, or several
  1069. variants of the code are used, the symbol string should be changed to reflect
  1070. that, in order to recalibrate a new model from zero. The symbol string can even
  1071. be constructed dynamically at execution time, as long as this is done before
  1072. submitting any task using it.
  1073. @cartouche
  1074. @smallexample
  1075. static struct starpu_perfmodel_t mult_perf_model = @{
  1076. .type = STARPU_HISTORY_BASED,
  1077. .symbol = "mult_perf_model"
  1078. @};
  1079. starpu_codelet cl = @{
  1080. .where = STARPU_CPU,
  1081. .cpu_func = cpu_mult,
  1082. .nbuffers = 3,
  1083. /* for the scheduling policy to be able to use performance models */
  1084. .model = &mult_perf_model
  1085. @};
  1086. @end smallexample
  1087. @end cartouche
  1088. @item
  1089. Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
  1090. model type). This still assumes performance regularity, but can work
  1091. with various data input sizes, by applying regression over observed
execution times. @code{STARPU_REGRESSION_BASED} uses an @code{a*n^b} regression
form, @code{STARPU_NL_REGRESSION_BASED} uses an @code{a*n^b+c} form (more precise than
@code{STARPU_REGRESSION_BASED}, but a lot more costly to compute). For instance,
  1095. @code{tests/perfmodels/regression_based.c} uses a regression-based performance
  1096. model for the @code{memset} operation.
  1097. @item
  1098. Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
  1099. see for instance
  1100. @code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
  1101. @item
  1102. Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
  1103. @code{.per_arch[i].cost_model} fields have to be filled with pointers to
  1104. functions which return the expected duration of the task in micro-seconds, one
  1105. per architecture.
  1106. @end itemize
How to use schedulers which can benefit from such performance models is explained
  1108. in @ref{Task scheduling policy}.
  1109. The same can be done for task power consumption estimation, by setting the
  1110. @code{power_model} field the same way as the @code{model} field. Note: for
now, the application has to give the power consumption performance model
a name which is different from that of the execution time performance model.
  1113. The application can request time estimations from the StarPU performance
  1114. models by filling a task structure as usual without actually submitting
  1115. it. The data handles can be created by calling @code{starpu_data_register}
  1116. functions with a @code{NULL} pointer (and need to be unregistered as usual)
  1117. and the desired data sizes. The @code{starpu_task_expected_length} and
  1118. @code{starpu_task_expected_power} functions can then be called to get an
  1119. estimation of the task duration on a given arch. @code{starpu_task_destroy}
  1120. needs to be called to destroy the dummy task afterwards. See
  1121. @code{tests/perfmodels/regression_based.c} for an example.
  1122. @node Theoretical lower bound on execution time
  1123. @section Theoretical lower bound on execution time
  1124. For kernels with history-based performance models, StarPU can very easily provide a theoretical lower
  1125. bound for the execution time of a whole set of tasks. See for
  1126. instance @code{examples/lu/lu_example.c}: before submitting tasks,
  1127. call @code{starpu_bound_start}, and after complete execution, call
  1128. @code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
  1129. @code{starpu_bound_print_mps} can then be used to output a Linear Programming
  1130. problem corresponding to the schedule of your tasks. Run it through
  1131. @code{lp_solve} or any other linear programming solver, and that will give you a
  1132. lower bound for the total execution time of your tasks. If StarPU was compiled
  1133. with the glpk library installed, @code{starpu_bound_compute} can be used to
  1134. solve it immediately and get the optimized minimum. Its @code{integer}
  1135. parameter allows to decide whether integer resolution should be computed
  1136. and returned.
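The following is a minimal sketch of this workflow. It assumes the
@code{starpu_bound_start(deps, prio)}, @code{starpu_bound_stop()} and
@code{starpu_bound_print_lp(FILE *)} prototypes of the StarPU bound interface,
and a hypothetical @code{submit_all_tasks} application function:
@cartouche
@smallexample
/* Start recording tasks (here without taking dependencies nor priorities
 * into account in the bound) */
starpu_bound_start(0, 0);

submit_all_tasks();             /* hypothetical application function */
starpu_task_wait_for_all();

starpu_bound_stop();

/* Dump a Linear Programming problem that can be fed to lp_solve */
FILE *output = fopen("test.lp", "w");
starpu_bound_print_lp(output);
fclose(output);
@end smallexample
@end cartouche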
  1137. The @code{deps} parameter tells StarPU whether to take tasks and implicit data
  1138. dependencies into account. It must be understood that the linear programming
problem size is quadratic with the number of tasks and thus the time to solve it
can be very long: it could be minutes for just a few dozen tasks. You should
probably use @code{lp_solve -timeout 1 test.lp -wmps test.mps} to convert the
  1142. problem to MPS format and then use a better solver, @code{glpsol} might be
  1143. better than @code{lp_solve} for instance (the @code{--pcost} option may be
  1144. useful), but sometimes doesn't manage to converge. @code{cbc} might look
  1145. slower, but it is parallel. Be sure to try at least all the @code{-B} options
  1146. of @code{lp_solve}. For instance, we often just use
  1147. @code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
  1148. the @code{-gr} option can also be quite useful.
  1149. Setting @code{deps} to 0 will only take into account the actual computations
  1150. on processing units. It however still properly takes into account the varying
performances of kernels and processing units, which is much more accurate than
  1152. just comparing StarPU performances with the fastest of the kernels being used.
  1153. The @code{prio} parameter tells StarPU whether to simulate taking into account
  1154. the priorities as the StarPU scheduler would, i.e. schedule prioritized
tasks before less prioritized tasks, to check to what extent this results
in a less optimal solution. This increases the computation time even more.
  1157. Note that for simplicity, all this however doesn't take into account data
  1158. transfers, which are assumed to be completely overlapped.
  1159. @node Insert Task Utility
  1160. @section Insert Task Utility
  1161. StarPU provides the wrapper function @code{starpu_insert_task} to ease
  1162. the creation and submission of tasks.
  1163. @deftypefun int starpu_insert_task (starpu_codelet *@var{cl}, ...)
  1164. Create and submit a task corresponding to @var{cl} with the following
  1165. arguments. The argument list must be zero-terminated.
The arguments following the codelet can be of the following types:
  1167. @itemize
  1168. @item
@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX}: an access mode followed by a data handle;
  1170. @item
  1171. @code{STARPU_VALUE} followed by a pointer to a constant value and
  1172. the size of the constant;
  1173. @item
  1174. @code{STARPU_CALLBACK} followed by a pointer to a callback function;
  1175. @item
  1176. @code{STARPU_CALLBACK_ARG} followed by a pointer to be given as an
  1177. argument to the callback function;
  1178. @item
@code{STARPU_PRIORITY} followed by an integer defining a priority level.
  1180. @end itemize
  1181. Parameters to be passed to the codelet implementation are defined
  1182. through the type @code{STARPU_VALUE}. The function
  1183. @code{starpu_unpack_cl_args} must be called within the codelet
  1184. implementation to retrieve them.
  1185. @end deftypefun
Here is the implementation of the codelet:
  1187. @smallexample
void func_cpu(void *descr[], void *_args)
@{
    int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
    float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
    int ifactor;
    float ffactor;

    starpu_unpack_cl_args(_args, &ifactor, &ffactor);
    *x0 = *x0 * ifactor;
    *x1 = *x1 * ffactor;
@}

starpu_codelet mycodelet = @{
    .where = STARPU_CPU,
    .cpu_func = func_cpu,
    .nbuffers = 2
@};
  1203. @end smallexample
  1204. And the call to the @code{starpu_insert_task} wrapper:
  1205. @smallexample
starpu_insert_task(&mycodelet,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                   0);
  1211. @end smallexample
  1212. The call to @code{starpu_insert_task} is equivalent to the following
  1213. code:
  1214. @smallexample
  1215. struct starpu_task *task = starpu_task_create();
  1216. task->cl = &mycodelet;
  1217. task->buffers[0].handle = data_handles[0];
  1218. task->buffers[0].mode = STARPU_RW;
  1219. task->buffers[1].handle = data_handles[1];
  1220. task->buffers[1].mode = STARPU_RW;
  1221. char *arg_buffer;
  1222. size_t arg_buffer_size;
  1223. starpu_pack_cl_args(&arg_buffer, &arg_buffer_size,
  1224. STARPU_VALUE, &ifactor, sizeof(ifactor),
  1225. STARPU_VALUE, &ffactor, sizeof(ffactor),
  1226. 0);
  1227. task->cl_arg = arg_buffer;
  1228. task->cl_arg_size = arg_buffer_size;
  1229. int ret = starpu_task_submit(task);
  1230. @end smallexample
  1231. If some part of the task insertion depends on the value of some computation,
  1232. the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
  1233. instance, assuming that the index variable @code{i} was registered as handle
  1234. @code{i_handle}:
  1235. @smallexample
  1236. /* Compute which portion we will work on, e.g. pivot */
  1237. starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
  1238. /* And submit the corresponding task */
  1239. STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R, starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
  1240. @end smallexample
  1241. The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
  1242. acquiring data @code{i} for the main application, and will execute the code
  1243. given as third parameter when it is acquired. In other words, as soon as the
  1244. value of @code{i} computed by the @code{which_index} codelet can be read, the
  1245. portion of code passed as third parameter of @code{STARPU_DATA_ACQUIRE_CB} will
  1246. be executed, and is allowed to read from @code{i} to use it e.g. as an
index. Note that this macro is only available when compiling StarPU with
  1248. the compiler @code{gcc}.
  1249. @node Debugging
  1250. @section Debugging
StarPU provides several tools to help debug applications. Execution traces
  1252. can be generated and displayed graphically, see @ref{Generating traces}. Some
  1253. gdb helpers are also provided to show the whole StarPU state:
  1254. @smallexample
  1255. (gdb) source tools/gdbinit
  1256. (gdb) help starpu
  1257. @end smallexample
  1258. @node More examples
  1259. @section More examples
  1260. More examples are available in the StarPU sources in the @code{examples/}
  1261. directory. Simple examples include:
  1262. @table @asis
  1263. @item @code{incrementer/}:
  1264. Trivial incrementation test.
  1265. @item @code{basic_examples/}:
  1266. Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
  1267. in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
  1268. product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
  1269. interface, and an example using the variable data interface.
  1270. @item @code{matvecmult/}:
  1271. OpenCL example from NVidia, adapted to StarPU.
  1272. @item @code{axpy/}:
  1273. AXPY CUBLAS operation adapted to StarPU.
  1274. @item @code{fortran/}:
  1275. Example of Fortran bindings.
  1276. @end table
  1277. More advanced examples include:
  1278. @table @asis
  1279. @item @code{filters/}:
  1280. Examples using filters, as shown in @ref{Partitioning Data}.
  1281. @item @code{lu/}:
LU matrix factorization, see for instance @code{xlu_implicit.c}.
  1283. @item @code{cholesky/}:
  1284. Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
  1285. @end table
  1286. @c ---------------------------------------------------------------------
  1287. @c Performance options
  1288. @c ---------------------------------------------------------------------
  1289. @node Performance optimization
  1290. @chapter How to optimize performance with StarPU
  1291. TODO: improve!
  1292. @menu
  1293. * Data management::
  1294. * Task submission::
  1295. * Task priorities::
  1296. * Task scheduling policy::
  1297. * Performance model calibration::
  1298. * Task distribution vs Data transfer::
  1299. * Data prefetch::
  1300. * Power-based scheduling::
  1301. * Profiling::
  1302. * CUDA-specific optimizations::
  1303. @end menu
Simply encapsulating application kernels into tasks already makes it possible to
seamlessly support CPUs and GPUs at the same time. To achieve good performance, a
  1306. few additional changes are needed.
  1307. @node Data management
  1308. @section Data management
  1309. When the application allocates data, whenever possible it should use the
  1310. @code{starpu_malloc} function, which will ask CUDA or
  1311. OpenCL to make the allocation itself and pin the corresponding allocated
  1312. memory. This is needed to permit asynchronous data transfer, i.e. permit data
  1313. transfer to overlap with computations. Otherwise, the trace will show that the
@code{DriverCopyAsync} state takes a lot of time: this is because CUDA or OpenCL
  1315. then reverts to synchronous transfers.
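As a minimal sketch (reusing the @code{NX} float vector of the previous
examples, and assuming the matching @code{starpu_free} call), allocation and
registration then look as follows:
@cartouche
@smallexample
float *vector;
starpu_data_handle vector_handle;

/* Ask StarPU to allocate (and pin, when CUDA or OpenCL is used) the buffer */
starpu_malloc((void **)&vector, NX * sizeof(vector[0]));

starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
                            NX, sizeof(vector[0]));

/* ... submit tasks ... */

starpu_data_unregister(vector_handle);
starpu_free(vector);
@end smallexample
@end cartouche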
  1316. By default, StarPU leaves replicates of data wherever they were used, in case they
  1317. will be re-used by other tasks, thus saving the data transfer time. When some
  1318. task modifies some data, all the other replicates are invalidated, and only the
  1319. processing unit which ran that task will have a valid replicate of the data. If the application knows
  1320. that this data will not be re-used by further tasks, it should advise StarPU to
  1321. immediately replicate it to a desired list of memory nodes (given through a
  1322. bitmask). This can be understood like the write-through mode of CPU caches.
  1323. @example
  1324. starpu_data_set_wt_mask(img_handle, 1<<0);
  1325. @end example
  1326. will for instance request to always automatically transfer a replicate into the
  1327. main memory (node 0), as bit 0 of the write-through bitmask is being set.
  1328. @example
  1329. starpu_data_set_wt_mask(img_handle, ~0U);
  1330. @end example
  1331. will request to always automatically broadcast the updated data to all memory
  1332. nodes.
  1333. @node Task submission
  1334. @section Task submission
  1335. To let StarPU make online optimizations, tasks should be submitted
  1336. asynchronously as much as possible. Ideally, all the tasks should be
  1337. submitted, and mere calls to @code{starpu_task_wait_for_all} or
  1338. @code{starpu_data_unregister} be done to wait for
  1339. termination. StarPU will then be able to rework the whole schedule, overlap
  1340. computation with communication, manage accelerator local memory usage, etc.
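A minimal sketch of this pattern, with a hypothetical @code{create_task}
helper building each task, is simply:
@cartouche
@smallexample
unsigned i;
for (i = 0; i < ntasks; i++)  /* ntasks: application-defined */
@{
    struct starpu_task *task = create_task(i);  /* hypothetical helper */
    task->synchronous = 0;                      /* submit asynchronously */
    starpu_task_submit(task);
@}

/* Only wait once everything has been submitted */
starpu_task_wait_for_all();
@end smallexample
@end cartouche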
  1341. @node Task priorities
  1342. @section Task priorities
  1343. By default, StarPU will consider the tasks in the order they are submitted by
  1344. the application. If the application programmer knows that some tasks should
  1345. be performed in priority (for instance because their output is needed by many
  1346. other tasks and may thus be a bottleneck if not executed early enough), the
  1347. @code{priority} field of the task structure should be set to transmit the
  1348. priority information to StarPU.
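For instance, assuming the @code{STARPU_MAX_PRIO} macro provided by StarPU, a
task whose output many other tasks are waiting for can be marked as follows:
@cartouche
@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
/* This output is needed early by many other tasks */
task->priority = STARPU_MAX_PRIO;
starpu_task_submit(task);
@end smallexample
@end cartouche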
  1349. @node Task scheduling policy
  1350. @section Task scheduling policy
  1351. By default, StarPU uses the @code{eager} simple greedy scheduler. This is
  1352. because it provides correct load balance even if the application codelets do not
  1353. have performance models. If your application codelets have performance models
  1354. (@pxref{Performance model example} for examples showing how to do it),
you should change the scheduler by setting the @code{STARPU_SCHED} environment
variable. For instance, @code{export STARPU_SCHED=dmda}. Use @code{STARPU_SCHED=help}
to get the list of available schedulers.
  1358. The @b{eager} scheduler uses a central task queue, from which workers draw tasks
to work on. This however does not allow data to be prefetched, since the scheduling
  1360. decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
  1361. The @b{prio} scheduler also uses a central task queue, but sorts tasks by
  1362. priority (between -5 and 5).
  1363. The @b{random} scheduler distributes tasks randomly according to assumed worker
  1364. overall performance.
  1365. The @b{ws} (work stealing) scheduler schedules tasks on the local worker by
  1366. default. When a worker becomes idle, it steals a task from the most loaded
  1367. worker.
The @b{dm} (deque model) scheduler takes task execution performance models into account to
perform a HEFT-like scheduling strategy: it schedules tasks where their
  1370. termination time will be minimal.
  1371. The @b{dmda} (deque model data aware) scheduler is similar to dm, it also takes
  1372. into account data transfer time.
  1373. The @b{dmdar} (deque model data aware ready) scheduler is similar to dmda,
  1374. it also sorts tasks on per-worker queues by number of already-available data
  1375. buffers.
  1376. The @b{dmdas} (deque model data aware sorted) scheduler is similar to dmda, it
  1377. also supports arbitrary priority values.
  1378. The @b{heft} (HEFT) scheduler is similar to dmda, it also supports task bundles.
  1379. The @b{pheft} (parallel HEFT) scheduler is similar to heft, it also supports
  1380. parallel tasks (still experimental).
  1381. The @b{pgreedy} (parallel greedy) scheduler is similar to greedy, it also
  1382. supports parallel tasks (still experimental).
  1383. @node Performance model calibration
  1384. @section Performance model calibration
  1385. Most schedulers are based on an estimation of codelet duration on each kind
  1386. of processing unit. For this to be possible, the application programmer needs
  1387. to configure a performance model for the codelets of the application (see
  1388. @ref{Performance model example} for instance). History-based performance models
  1389. use on-line calibration. StarPU will automatically calibrate codelets
  1390. which have never been calibrated yet, and save the result in
  1391. @code{~/.starpu/sampling/codelets}.
  1392. The models are indexed by machine name. To share the models between machines (e.g. for a homogeneous cluster), use @code{export STARPU_HOSTNAME=some_global_name}. To force continuing calibration, use
  1393. @code{export STARPU_CALIBRATE=1} . This may be necessary if your application
  1394. has not-so-stable performance. StarPU will force calibration (and thus ignore
  1395. the current result) until 10 (STARPU_CALIBRATION_MINIMUM) measurements have been
  1396. made on each architecture, to avoid badly scheduling tasks just because the
  1397. first measurements were not so good. Details on the current performance model status
  1398. can be obtained from the @code{starpu_perfmodel_display} command: the @code{-l}
  1399. option lists the available performance models, and the @code{-s} option permits
  1400. to choose the performance model to be displayed. The result looks like:
  1401. @example
  1402. $ starpu_perfmodel_display -s starpu_dlu_lu_model_22
  1403. performance model for cpu
  1404. # hash size mean dev n
  1405. 880805ba 98304 2.731309e+02 6.010210e+01 1240
  1406. b50b6605 393216 1.469926e+03 1.088828e+02 1240
  1407. 5c6c3401 1572864 1.125983e+04 3.265296e+03 1240
  1408. @end example
This shows that for the LU 22 kernel with a 1.5MiB matrix, the average
execution time on CPUs was about 11ms, with a 3ms standard deviation, over
  1411. 1240 samples. It is a good idea to check this before doing actual performance
  1412. measurements.
A graph can be drawn by using the @code{starpu_perfmodel_plot} tool:
  1414. @example
  1415. $ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
  1416. 98304 393216 1572864
  1417. $ gnuplot starpu_starpu_dlu_lu_model_22.gp
  1418. $ gv starpu_starpu_dlu_lu_model_22.eps
  1419. @end example
  1420. If a kernel source code was modified (e.g. performance improvement), the
calibration information is stale and should be dropped, to re-calibrate from
scratch. This can be done by using @code{export STARPU_CALIBRATE=2}.
  1423. Note: due to CUDA limitations, to be able to measure kernel duration,
  1424. calibration mode needs to disable asynchronous data transfers. Calibration thus
  1425. disables data transfer / computation overlapping, and should thus not be used
for final benchmarks. Note 2: history-based performance models get calibrated
  1427. only if a performance-model-based scheduler is chosen.
  1428. @node Task distribution vs Data transfer
  1429. @section Task distribution vs Data transfer
  1430. Distributing tasks to balance the load induces data transfer penalty. StarPU
  1431. thus needs to find a balance between both. The target function that the
  1432. @code{dmda} scheduler of StarPU
  1433. tries to minimize is @code{alpha * T_execution + beta * T_data_transfer}, where
  1434. @code{T_execution} is the estimated execution time of the codelet (usually
  1435. accurate), and @code{T_data_transfer} is the estimated data transfer time. The
  1436. latter is estimated based on bus calibration before execution start,
  1437. i.e. with an idle machine, thus without contention. You can force bus re-calibration by running
  1438. @code{starpu_calibrate_bus}. The beta parameter defaults to 1, but it can be
  1439. worth trying to tweak it by using @code{export STARPU_BETA=2} for instance,
  1440. since during real application execution, contention makes transfer times bigger.
  1441. This is of course imprecise, but in practice, a rough estimation already gives
  1442. the good results that a precise estimation would give.
  1443. @node Data prefetch
  1444. @section Data prefetch
  1445. The @code{heft}, @code{dmda} and @code{pheft} scheduling policies perform data prefetch (see @ref{STARPU_PREFETCH}):
  1446. as soon as a scheduling decision is taken for a task, requests are issued to
transfer its required data to the target processing unit, if needed, so that
  1448. when the processing unit actually starts the task, its data will hopefully be
  1449. already available and it will not have to wait for the transfer to finish.
  1450. The application may want to perform some manual prefetching, for several reasons
  1451. such as excluding initial data transfers from performance measurements, or
  1452. setting up an initial statically-computed data distribution on the machine
  1453. before submitting tasks, which will thus guide StarPU toward an initial task
  1454. distribution (since StarPU will try to avoid further transfers).
  1455. This can be achieved by giving the @code{starpu_data_prefetch_on_node} function
  1456. the handle and the desired target memory node.
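For instance, the following hypothetical sketch asks for a prefetch of
@code{vector_handle} to memory node 1 (e.g. the first GPU); it assumes that
@code{starpu_data_prefetch_on_node} takes the handle, the target node, and an
@code{async} flag, which should be checked against the reference manual:
@cartouche
@smallexample
/* Asynchronously bring the data to memory node 1 before tasks need it */
starpu_data_prefetch_on_node(vector_handle, 1, 1);
@end smallexample
@end cartouche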
  1457. @node Power-based scheduling
  1458. @section Power-based scheduling
  1459. If the application can provide some power performance model (through
  1460. the @code{power_model} field of the codelet structure), StarPU will
  1461. take it into account when distributing tasks. The target function that
  1462. the @code{dmda} scheduler minimizes becomes @code{alpha * T_execution +
  1463. beta * T_data_transfer + gamma * Consumption} , where @code{Consumption}
  1464. is the estimated task consumption in Joules. To tune this parameter, use
  1465. @code{export STARPU_GAMMA=3000} for instance, to express that each Joule
(i.e. 1 kW during 1000 us) is worth 3000 us of execution time penalty. Setting
@code{alpha} and @code{beta} to zero makes it possible to take only power consumption into account.
  1468. This is however not sufficient to correctly optimize power: the scheduler would
  1469. simply tend to run all computations on the most energy-conservative processing
  1470. unit. To account for the consumption of the whole machine (including idle
  1471. processing units), the idle power of the machine should be given by setting
  1472. @code{export STARPU_IDLE_POWER=200} for 200W, for instance. This value can often
  1473. be obtained from the machine power supplier.
  1474. The power actually consumed by the total execution can be displayed by setting
  1475. @code{export STARPU_PROFILING=1 STARPU_WORKER_STATS=1} .
  1476. @node Profiling
  1477. @section Profiling
  1478. A quick view of how many tasks each worker has executed can be obtained by setting
@code{export STARPU_WORKER_STATS=1}. This is a convenient way to check that
  1480. execution did happen on accelerators without penalizing performance with
  1481. the profiling overhead.
A quick view of how many data transfers have been issued can be obtained by setting
@code{export STARPU_BUS_STATS=1}.
  1484. More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
  1485. calling @code{starpu_profiling_status_set} from the source code.
  1486. Statistics on the execution can then be obtained by using @code{export
  1487. STARPU_BUS_STATS=1} and @code{export STARPU_WORKER_STATS=1} .
  1488. More details on performance feedback are provided by the next chapter.
  1489. @node CUDA-specific optimizations
  1490. @section CUDA-specific optimizations
  1491. Due to CUDA limitations, StarPU will have a hard time overlapping its own
  1492. communications and the codelet computations if the application does not use a
  1493. dedicated CUDA stream for its computations. StarPU provides one by the use of
  1494. @code{starpu_cuda_get_local_stream()} which should be used by all CUDA codelet
  1495. operations. For instance:
  1496. @example
  1497. func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
  1498. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  1499. @end example
  1500. StarPU already does appropriate calls for the CUBLAS library.
  1501. Unfortunately, some CUDA libraries do not have stream variants of
  1502. kernels. That will lower the potential for overlapping.
  1503. @c ---------------------------------------------------------------------
  1504. @c Performance feedback
  1505. @c ---------------------------------------------------------------------
  1506. @node Performance feedback
  1507. @chapter Performance feedback
  1508. @menu
  1509. * On-line:: On-line performance feedback
  1510. * Off-line:: Off-line performance feedback
  1511. * Codelet performance:: Performance of codelets
  1512. @end menu
  1513. @node On-line
  1514. @section On-line performance feedback
  1515. @menu
  1516. * Enabling monitoring:: Enabling on-line performance monitoring
  1517. * Task feedback:: Per-task feedback
  1518. * Codelet feedback:: Per-codelet feedback
  1519. * Worker feedback:: Per-worker feedback
  1520. * Bus feedback:: Bus-related feedback
  1521. * StarPU-Top:: StarPU-Top interface
  1522. @end menu
  1523. @node Enabling monitoring
  1524. @subsection Enabling on-line performance monitoring
  1525. In order to enable online performance monitoring, the application can call
  1526. @code{starpu_profiling_status_set(STARPU_PROFILING_ENABLE)}. It is possible to
  1527. detect whether monitoring is already enabled or not by calling
@code{starpu_profiling_status_get()}. Enabling monitoring also reinitializes all
  1529. previously collected feedback. The @code{STARPU_PROFILING} environment variable
  1530. can also be set to 1 to achieve the same effect.
  1531. Likewise, performance monitoring is stopped by calling
  1532. @code{starpu_profiling_status_set(STARPU_PROFILING_DISABLE)}. Note that this
  1533. does not reset the performance counters so that the application may consult
  1534. them later on.
  1535. More details about the performance monitoring API are available in section
  1536. @ref{Profiling API}.
  1537. @node Task feedback
  1538. @subsection Per-task feedback
  1539. If profiling is enabled, a pointer to a @code{starpu_task_profiling_info}
  1540. structure is put in the @code{.profiling_info} field of the @code{starpu_task}
  1541. structure when a task terminates.
  1542. This structure is automatically destroyed when the task structure is destroyed,
  1543. either automatically or by calling @code{starpu_task_destroy}.
  1544. The @code{starpu_task_profiling_info} structure indicates the date when the
  1545. task was submitted (@code{submit_time}), started (@code{start_time}), and
  1546. terminated (@code{end_time}), relative to the initialization of
  1547. StarPU with @code{starpu_init}. It also specifies the identifier of the worker
  1548. that has executed the task (@code{workerid}).
These dates are stored as @code{timespec} structures which the user may convert
  1550. into micro-seconds using the @code{starpu_timing_timespec_to_us} helper
  1551. function.
It is worth noting that the application may directly access this structure from
  1553. the callback executed at the end of the task. The @code{starpu_task} structure
  1554. associated to the callback currently being executed is indeed accessible with
  1555. the @code{starpu_get_current_task()} function.
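As a short sketch, the task duration can for instance be printed from a
hypothetical end-of-task callback, using the
@code{starpu_timing_timespec_delay_us} helper shown earlier:
@cartouche
@smallexample
void my_callback(void *arg)
@{
    struct starpu_task *task = starpu_get_current_task();
    struct starpu_task_profiling_info *info = task->profiling_info;

    /* Duration of the task, in microseconds */
    double length = starpu_timing_timespec_delay_us(&info->start_time,
                                                    &info->end_time);
    fprintf(stderr, "task length: %f us on worker %d\n", length, info->workerid);
@}
@end smallexample
@end cartouche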
  1556. @node Codelet feedback
  1557. @subsection Per-codelet feedback
  1558. The @code{per_worker_stats} field of the @code{starpu_codelet_t} structure is
  1559. an array of counters. The i-th entry of the array is incremented every time a
  1560. task implementing the codelet is executed on the i-th worker.
  1561. This array is not reinitialized when profiling is enabled or disabled.
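A small sketch of how this array may be displayed, assuming a codelet variable
@code{cl} with the @code{per_worker_stats} field mentioned above:
@cartouche
@smallexample
unsigned worker;
for (worker = 0; worker < starpu_worker_get_count(); worker++)
@{
    char name[64];
    starpu_worker_get_name(worker, name, sizeof(name));
    fprintf(stderr, "%s executed the codelet %lu times\n",
            name, (unsigned long) cl.per_worker_stats[worker]);
@}
@end smallexample
@end cartouche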
  1562. @node Worker feedback
  1563. @subsection Per-worker feedback
  1564. The second argument returned by the @code{starpu_worker_get_profiling_info}
  1565. function is a @code{starpu_worker_profiling_info} structure that gives
  1566. statistics about the specified worker. This structure specifies when StarPU
  1567. started collecting profiling information for that worker (@code{start_time}),
  1568. the duration of the profiling measurement interval (@code{total_time}), the
  1569. time spent executing kernels (@code{executing_time}), the time spent sleeping
  1570. because there is no task to execute at all (@code{sleeping_time}), and the
  1571. number of tasks that were executed while profiling was enabled.
These values give an estimation of the proportion of time spent doing real work,
  1573. and the time spent either sleeping because there are not enough executable
  1574. tasks or simply wasted in pure StarPU overhead.
  1575. Calling @code{starpu_worker_get_profiling_info} resets the profiling
  1576. information associated to a worker.
  1577. When an FxT trace is generated (see @ref{Generating traces}), it is also
  1578. possible to use the @code{starpu_top} script (described in @ref{starpu-top}) to
generate a graphic showing the evolution of these values over time, for
  1580. the different workers.
  1581. @node Bus feedback
  1582. @subsection Bus-related feedback
  1583. TODO
  1584. @c how to enable/disable performance monitoring
  1585. @c what kind of information do we get ?
  1586. The bus speed measured by StarPU can be displayed by using the
  1587. @code{starpu_machine_display} tool, for instance:
  1588. @example
  1589. StarPU has found :
  1590. 3 CUDA devices
  1591. CUDA 0 (Tesla C2050 02:00.0)
  1592. CUDA 1 (Tesla C2050 03:00.0)
  1593. CUDA 2 (Tesla C2050 84:00.0)
from     to RAM        to CUDA 0     to CUDA 1     to CUDA 2
RAM      0.000000      5176.530428   5176.492994   5191.710722
CUDA 0   4523.732446   0.000000      2414.074751   2417.379201
CUDA 1   4523.718152   2414.078822   0.000000      2417.375119
CUDA 2   4534.229519   2417.069025   2417.060863   0.000000
  1599. @end example
  1600. @node StarPU-Top
  1601. @subsection StarPU-Top interface
  1602. StarPU-Top is an interface which remotely displays the on-line state of a StarPU
  1603. application and permits the user to change parameters on the fly.
  1604. Variables to be monitored can be registered by calling the
  1605. @code{starputop_add_data_boolean}, @code{starputop_add_data_integer},
  1606. @code{starputop_add_data_float} functions, e.g.:
  1607. @example
  1608. starputop_data *data = starputop_add_data_integer("mynum", 0, 100, 1);
  1609. @end example
  1610. The application should then call @code{starputop_init_and_wait} to give its name
  1611. and wait for StarPU-Top to get a start request from the user. The name is used
  1612. by StarPU-Top to quickly reload a previously-saved layout of parameter display.
  1613. @example
  1614. starputop_init_and_wait("the application");
  1615. @end example
  1616. The new values can then be provided thanks to
  1617. @code{starputop_update_data_boolean}, @code{starputop_update_data_integer},
  1618. @code{starputop_update_data_float}, e.g.:
  1619. @example
  1620. starputop_update_data_integer(data, mynum);
  1621. @end example
  1622. Updateable parameters can be registered thanks to @code{starputop_register_parameter_boolean}, @code{starputop_register_parameter_integer}, @code{starputop_register_parameter_float}, e.g.:
  1623. @example
float alpha;
  1625. starputop_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
  1626. @end example
  1627. @code{modif_hook} is a function which will be called when the parameter is being modified, it can for instance print the new value:
  1628. @example
  1629. void modif_hook(struct starputop_param_t *d) @{
  1630. fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
  1631. @}
  1632. @end example
Task schedulers should notify StarPU-Top when they have decided when a task will be
scheduled, so that it can be shown in the Gantt chart, for instance:
  1635. @example
  1636. starputop_task_prevision(task, workerid, begin, end);
  1637. @end example
  1638. Starting StarPU-Top and the application can be done two ways:
  1639. @itemize
  1640. @item The application is started by hand on some machine (and thus already
  1641. waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
  1642. checkbox should be unchecked, and the hostname and port (default is 2011) on
  1643. which the application is already running should be specified. Clicking on the
  1644. connection button will thus connect to the already-running application.
  1645. @item StarPU-Top is started first, and clicking on the connection button will
  1646. start the application itself (possibly on a remote machine). The SSH checkbox
  1647. should be checked, and a command line provided, e.g.:
  1648. @example
  1649. ssh myserver STARPU_SCHED=heft ./application
  1650. @end example
If port 2011 of the remote machine cannot be accessed directly, an SSH port forwarding should be added:
  1652. @example
  1653. ssh -L 2011:localhost:2011 myserver STARPU_SCHED=heft ./application
  1654. @end example
and @code{localhost} should be used as the IP address to connect to.
  1656. @end itemize
  1657. @node Off-line
  1658. @section Off-line performance feedback
  1659. @menu
  1660. * Generating traces:: Generating traces with FxT
  1661. * Gantt diagram:: Creating a Gantt Diagram
  1662. * DAG:: Creating a DAG with graphviz
  1663. * starpu-top:: Monitoring activity
  1664. @end menu
  1665. @node Generating traces
  1666. @subsection Generating traces with FxT
  1667. StarPU can use the FxT library (see
  1668. @indicateurl{https://savannah.nongnu.org/projects/fkt/}) to generate traces
  1669. with a limited runtime overhead.
  1670. You can either get a tarball:
  1671. @example
  1672. % wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.2.tar.gz
  1673. @end example
  1674. or use the FxT library from CVS (autotools are required):
  1675. @example
  1676. % cvs -d :pserver:anonymous@@cvs.sv.gnu.org:/sources/fkt co FxT
  1677. % ./bootstrap
  1678. @end example
  1679. Compiling and installing the FxT library in the @code{$FXTDIR} path is
  1680. done following the standard procedure:
  1681. @example
  1682. % ./configure --prefix=$FXTDIR
  1683. % make
  1684. % make install
  1685. @end example
In order to have StarPU generate traces, StarPU should be configured with
  1687. the @code{--with-fxt} option:
  1688. @example
  1689. $ ./configure --with-fxt=$FXTDIR
  1690. @end example
  1691. Or you can simply point the @code{PKG_CONFIG_PATH} to
@code{$FXTDIR/lib/pkgconfig} and pass @code{--with-fxt} to @code{./configure}.
  1693. When FxT is enabled, a trace is generated when StarPU is terminated by calling
@code{starpu_shutdown()}. The trace is a binary file whose name has the form
  1695. @code{prof_file_XXX_YYY} where @code{XXX} is the user name, and
  1696. @code{YYY} is the pid of the process that used StarPU. This file is saved in the
@code{/tmp/} directory by default, or in the directory specified by
  1698. the @code{STARPU_FXT_PREFIX} environment variable.
  1699. @node Gantt diagram
  1700. @subsection Creating a Gantt Diagram
  1701. When the FxT trace file @code{filename} has been generated, it is possible to
  1702. generate a trace in the Paje format by calling:
  1703. @example
  1704. % starpu_fxt_tool -i filename
  1705. @end example
  1706. Or alternatively, setting the @code{STARPU_GENERATE_TRACE} environment variable
  1707. to 1 before application execution will make StarPU do it automatically at
  1708. application shutdown.
  1709. This will create a @code{paje.trace} file in the current directory that can be
inspected with ViTE, an open-source trace visualization tool. More information
  1711. about ViTE is available at @indicateurl{http://vite.gforge.inria.fr/}. It is
  1712. possible to open the @code{paje.trace} file with ViTE by using the following
  1713. command:
  1714. @example
  1715. % vite paje.trace
  1716. @end example
  1717. @node DAG
  1718. @subsection Creating a DAG with graphviz
  1719. When the FxT trace file @code{filename} has been generated, it is possible to
  1720. generate a task graph in the DOT format by calling:
  1721. @example
  1722. $ starpu_fxt_tool -i filename
  1723. @end example
  1724. This will create a @code{dag.dot} file in the current directory. This file is a
  1725. task graph described using the DOT language. It is possible to get a
  1726. graphical output of the graph by using the graphviz library:
  1727. @example
  1728. $ dot -Tpdf dag.dot -o output.pdf
  1729. @end example
  1730. @node starpu-top
  1731. @subsection Monitoring activity
  1732. When the FxT trace file @code{filename} has been generated, it is possible to
generate an activity trace by calling:
  1734. @example
  1735. $ starpu_fxt_tool -i filename
  1736. @end example
  1737. This will create an @code{activity.data} file in the current
  1738. directory. A profile of the application showing the activity of StarPU
  1739. during the execution of the program can be generated:
  1740. @example
  1741. $ starpu_top activity.data
  1742. @end example
  1743. This will create a file named @code{activity.eps} in the current directory.
  1744. This picture is composed of two parts.
  1745. The first part shows the activity of the different workers. The green sections
indicate which proportion of the time was spent executing kernels on the
processing unit. The red sections indicate the proportion of time spent in
StarPU: a large overhead may indicate that the granularity is too
low, and that bigger tasks may be appropriate to use the processing unit more
efficiently. The black sections indicate that the processing unit was blocked
because there was no task to process: this may indicate a lack of parallelism
which may be alleviated by creating more tasks when possible.
  1753. The second part of the @code{activity.eps} picture is a graph showing the
  1754. evolution of the number of tasks available in the system during the execution.
  1755. Ready tasks are shown in black, and tasks that are submitted but not
  1756. schedulable yet are shown in grey.
  1757. @node Codelet performance
  1758. @section Performance of codelets
  1759. The performance model of codelets can be examined by using the
  1760. @code{starpu_perfmodel_display} tool:
  1761. @example
  1762. $ starpu_perfmodel_display -l
  1763. file: <malloc_pinned.hannibal>
  1764. file: <starpu_slu_lu_model_21.hannibal>
  1765. file: <starpu_slu_lu_model_11.hannibal>
  1766. file: <starpu_slu_lu_model_22.hannibal>
  1767. file: <starpu_slu_lu_model_12.hannibal>
  1768. @end example
  1769. Here, the codelets of the lu example are available. We can examine the
  1770. performance of the 22 kernel:
  1771. @example
  1772. $ starpu_perfmodel_display -s starpu_slu_lu_model_22
  1773. performance model for cpu
  1774. # hash size mean dev n
  1775. 57618ab0 19660800 2.851069e+05 1.829369e+04 109
  1776. performance model for cuda_0
  1777. # hash size mean dev n
  1778. 57618ab0 19660800 1.164144e+04 1.556094e+01 315
  1779. performance model for cuda_1
  1780. # hash size mean dev n
  1781. 57618ab0 19660800 1.164271e+04 1.330628e+01 360
  1782. performance model for cuda_2
  1783. # hash size mean dev n
  1784. 57618ab0 19660800 1.166730e+04 3.390395e+02 456
  1785. @end example
We can see that for the given size, over a sample of a few hundred
executions, the GPUs are about 20 times faster than the CPUs (numbers are in
  1788. us). The standard deviation is extremely low for the GPUs, and less than 10% for
  1789. CPUs.
  1790. The @code{starpu_regression_display} tool does the same for regression-based
  1791. performance models. It also writes a @code{.gp} file in the current directory,
  1792. to be run in the @code{gnuplot} tool, which shows the corresponding curve.
  1793. @c ---------------------------------------------------------------------
  1794. @c MPI support
  1795. @c ---------------------------------------------------------------------
  1796. @node StarPU MPI support
  1797. @chapter StarPU MPI support
  1798. The integration of MPI transfers within task parallelism is done in a
  1799. very natural way by the means of asynchronous interactions between the
  1800. application and StarPU. This is implemented in a separate libstarpumpi library
  1801. which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
  1802. @code{void *} buffers are replaced with @code{starpu_data_handle}s, and all
  1803. GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI. The user has to
  1804. use the usual @code{mpirun} command of the MPI implementation to start StarPU on
  1805. the different MPI nodes.
  1806. @menu
  1807. * The API::
  1808. * Simple Example::
  1809. * MPI Insert Task Utility::
  1810. * MPI Collective Operations::
  1811. @end menu
  1812. @node The API
  1813. @section The API
  1814. @subsection Initialisation
  1815. @deftypefun int starpu_mpi_initialize (void)
Initializes the starpumpi library. This must be called after calling
@code{starpu_init} and before any other @code{starpu_mpi} function. This
function does not call @code{MPI_Init}; @code{MPI_Init} should thus be called beforehand.
  1819. @end deftypefun
  1820. @deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
Initializes the starpumpi library. This must be called after calling
@code{starpu_init} and before any other @code{starpu_mpi} function.
This function calls @code{MPI_Init}, and should therefore be preferred
to the previous one for MPI implementations which are not thread-safe.
  1825. Returns the current MPI node rank and world size.
  1826. @end deftypefun
  1827. @deftypefun int starpu_mpi_shutdown (void)
Cleans up the starpumpi library. This must be called after all
@code{starpu_mpi} functions and before @code{starpu_shutdown}.
  1830. @code{MPI_Finalize} will be called if StarPU-MPI has been initialized
  1831. by calling @code{starpu_mpi_initialize_extended}.
  1832. @end deftypefun
  1833. @subsection Communication
  1834. @deftypefun int starpu_mpi_send (starpu_data_handle @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
  1835. @end deftypefun
  1836. @deftypefun int starpu_mpi_recv (starpu_data_handle @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, MPI_Status *@var{status})
  1837. @end deftypefun
  1838. @deftypefun int starpu_mpi_isend (starpu_data_handle @var{data_handle}, starpu_mpi_req *@var{req}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
  1839. @end deftypefun
  1840. @deftypefun int starpu_mpi_irecv (starpu_data_handle @var{data_handle}, starpu_mpi_req *@var{req}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm})
  1841. @end deftypefun
  1842. @deftypefun int starpu_mpi_isend_detached (starpu_data_handle @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
  1843. @end deftypefun
  1844. @deftypefun int starpu_mpi_irecv_detached (starpu_data_handle @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
  1845. @end deftypefun
  1846. @deftypefun int starpu_mpi_wait (starpu_mpi_req *@var{req}, MPI_Status *@var{status})
  1847. @end deftypefun
  1848. @deftypefun int starpu_mpi_test (starpu_mpi_req *@var{req}, int *@var{flag}, MPI_Status *@var{status})
  1849. @end deftypefun
  1850. @deftypefun int starpu_mpi_barrier (MPI_Comm @var{comm})
  1851. @end deftypefun
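As a minimal sketch of the request-based variants (assuming a registered
handle @code{data_handle} and a peer rank @code{other_rank}, both of which are
illustrative names), a transfer can be started asynchronously and waited for
later:
@cartouche
@smallexample
starpu_mpi_req req;
MPI_Status status;

/* Start the transfer without blocking the application thread */
starpu_mpi_isend(data_handle, &req, other_rank, /* mpi_tag */ 42, MPI_COMM_WORLD);

/* ... submit tasks or post other communications here ... */

/* Block until the communication is complete */
starpu_mpi_wait(&req, &status);
@end smallexample
@end cartouche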
  1852. @deftypefun int starpu_mpi_isend_detached_unlock_tag (starpu_data_handle @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag_t @var{tag})
When the transfer is completed, the tag is unlocked.
  1854. @end deftypefun
  1855. @deftypefun int starpu_mpi_irecv_detached_unlock_tag (starpu_data_handle @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag_t @var{tag})
  1856. @end deftypefun
  1857. @deftypefun int starpu_mpi_isend_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle *@var{data_handle}, int *@var{dest}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
Asynchronously sends an array of buffers, and unlocks the tag once all
of them have been transmitted.
  1860. @end deftypefun
  1861. @deftypefun int starpu_mpi_irecv_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle *@var{data_handle}, int *@var{source}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
  1862. @end deftypefun
  1863. @page
  1864. @node Simple Example
  1865. @section Simple Example
  1866. @cartouche
  1867. @smallexample
void increment_token(void)
@{
    struct starpu_task *task = starpu_task_create();

    task->cl = &increment_cl;
    task->buffers[0].handle = token_handle;
    task->buffers[0].mode = STARPU_RW;

    starpu_task_submit(task);
@}
  1876. @end smallexample
  1877. @end cartouche
  1878. @cartouche
  1879. @smallexample
int main(int argc, char **argv)
@{
    int rank, size;

    starpu_init(NULL);
    starpu_mpi_initialize_extended(&rank, &size);

    starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));

    unsigned nloops = NITER;
    unsigned loop;
    unsigned last_loop = nloops - 1;
    unsigned last_rank = size - 1;
  1890. @end smallexample
  1891. @end cartouche
  1892. @cartouche
  1893. @smallexample
    for (loop = 0; loop < nloops; loop++) @{
        int tag = loop*size + rank;

        if (loop == 0 && rank == 0)
        @{
            token = 0;
            fprintf(stdout, "Start with token value %d\n", token);
        @}
        else
        @{
            starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag,
                                      MPI_COMM_WORLD, NULL, NULL);
        @}

        increment_token();

        if (loop == last_loop && rank == last_rank)
        @{
            starpu_data_acquire(token_handle, STARPU_R);
            fprintf(stdout, "Finished : token value %d\n", token);
            starpu_data_release(token_handle);
        @}
        else
        @{
            starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1,
                                      MPI_COMM_WORLD, NULL, NULL);
        @}
    @}

    starpu_task_wait_for_all();
  1920. @end smallexample
  1921. @end cartouche
  1922. @cartouche
  1923. @smallexample
    starpu_mpi_shutdown();
    starpu_shutdown();

    if (rank == last_rank)
    @{
        fprintf(stderr, "[%d] token = %d == %d * %d ?\n", rank, token, nloops, size);
        STARPU_ASSERT(token == nloops*size);
    @}
  1931. @end smallexample
  1932. @end cartouche
  1933. @page
  1934. @node MPI Insert Task Utility
  1935. @section MPI Insert Task Utility
  1936. @deftypefun void starpu_mpi_insert_task (MPI_Comm @var{comm}, starpu_codelet *@var{cl}, ...)
  1937. Create and submit a task corresponding to @var{cl} with the following
  1938. arguments. The argument list must be zero-terminated.
The arguments following the codelet are of the same types as for the
function @code{starpu_insert_task} defined in @ref{Insert Task
Utility}. The extra argument @code{STARPU_EXECUTE_ON_NODE} followed by an
integer makes it possible to specify the node on which to execute the
codelet. It is also possible to specify that the node owning a specific
piece of data will execute the codelet, by using
@code{STARPU_EXECUTE_ON_DATA} followed by a data handle.
  1946. The algorithm is as follows:
  1947. @enumerate
  1948. @item Find out whether we are to execute the codelet because we own the
  1949. data to be written to. If different tasks own data to be written to,
  1950. the argument @code{STARPU_EXECUTE_ON_NODE} or
  1951. @code{STARPU_EXECUTE_ON_DATA} should be used to specify the executing
  1952. task @code{ET}.
@item Send and receive data as requested. Tasks owning data which needs
to be read by the executing task @code{ET} send it to @code{ET}.
@item Execute the codelet. This is done by the task selected in the
first step of the algorithm.
@item In the case where different tasks own data to be written to, send
the written data back to their owners.
  1959. @end enumerate
The algorithm also includes a cache mechanism that avoids sending
data twice to the same task, unless the data has been modified.
  1962. @end deftypefun
  1963. @deftypefun void starpu_mpi_get_data_on_node (MPI_Comm @var{comm}, starpu_data_handle @var{data_handle}, int @var{node})
  1964. @end deftypefun
  1965. @page
Here is an example showing how to use @code{starpu_mpi_insert_task}. One
first needs to define a distribution function which specifies the
locality of the data. Note that the distribution information needs to
be given to StarPU by calling @code{starpu_data_set_rank}.
  1970. @cartouche
  1971. @smallexample
/* Returns the MPI node number where data is */
int my_distrib(int x, int y, int nb_nodes) @{
    /* Cyclic distrib */
    return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
    // /* Linear distrib */
    // return x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * X;
@}
  1979. @end smallexample
  1980. @end cartouche
  1981. Now the data can be registered within StarPU. Data which are not
  1982. owned but will be needed for computations can be registered through
  1983. the lazy allocation mechanism, i.e. with a @code{home_node} set to -1.
  1984. StarPU will automatically allocate the memory when it is used for the
  1985. first time.
  1986. @cartouche
  1987. @smallexample
unsigned matrix[X][Y];
starpu_data_handle data_handles[X][Y];

for(x = 0; x < X; x++) @{
    for (y = 0; y < Y; y++) @{
        int mpi_rank = my_distrib(x, y, size);
        if (mpi_rank == rank)
            /* Owning data */
            starpu_variable_data_register(&data_handles[x][y], 0,
                                          (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
            /* I don't own that index, but will need it for my computations */
            starpu_variable_data_register(&data_handles[x][y], -1,
                                          (uintptr_t)NULL, sizeof(unsigned));
        else
            /* I know it's useless to allocate anything for this */
            data_handles[x][y] = NULL;
        if (data_handles[x][y])
            starpu_data_set_rank(data_handles[x][y], mpi_rank);
    @}
@}
  2008. @end smallexample
  2009. @end cartouche
  2010. Now @code{starpu_mpi_insert_task()} can be called for the different
  2011. steps of the application.
  2012. @cartouche
  2013. @smallexample
for(loop=0 ; loop<niter; loop++)
    for (x = 1; x < X-1; x++)
        for (y = 1; y < Y-1; y++)
            starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl,
                                   STARPU_RW, data_handles[x][y],
                                   STARPU_R, data_handles[x-1][y],
                                   STARPU_R, data_handles[x+1][y],
                                   STARPU_R, data_handles[x][y-1],
                                   STARPU_R, data_handles[x][y+1],
                                   0);

starpu_task_wait_for_all();
  2025. @end smallexample
  2026. @end cartouche
  2027. @node MPI Collective Operations
  2028. @section MPI Collective Operations
  2029. @deftypefun int starpu_mpi_scatter_detached (starpu_data_handle *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
  2030. Scatter data among processes of the communicator based on the ownership of
the data. For each data item in the array @var{data_handles}, the
process @var{root} sends it to the process owning this data.
Processes receiving data must have registered valid data handles to receive them.
  2034. @end deftypefun
  2035. @deftypefun int starpu_mpi_gather_detached (starpu_data_handle *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
Gather data from the different processes of the communicator onto the
process @var{root}. Each process owning a data handle in the array
@var{data_handles} sends it to the process @var{root}. The
process @var{root} must have registered valid data handles to receive the data.
  2040. @end deftypefun
  2041. @page
  2042. @cartouche
  2043. @smallexample
if (rank == root)
@{
    /* Allocate the vector */
    vector = malloc(nblocks * sizeof(float *));
    for(x=0 ; x<nblocks ; x++)
    @{
        starpu_malloc((void **)&vector[x], block_size*sizeof(float));
    @}
@}

/* Allocate data handles and register data to StarPU */
data_handles = malloc(nblocks*sizeof(starpu_data_handle *));
for(x = 0; x < nblocks ; x++)
@{
    int mpi_rank = my_distrib(x, nodes);
    if (rank == root) @{
        starpu_vector_data_register(&data_handles[x], 0, (uintptr_t)vector[x],
                                    block_size, sizeof(float));
    @}
    else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1))) @{
        /* I own that index, or I will need it for my computations */
        starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL,
                                    block_size, sizeof(float));
    @}
    else @{
        /* I know it's useless to allocate anything for this */
        data_handles[x] = NULL;
    @}
    if (data_handles[x]) @{
        starpu_data_set_rank(data_handles[x], mpi_rank);
    @}
@}

/* Scatter the matrix among the nodes */
starpu_mpi_scatter_detached(data_handles, nblocks, root, MPI_COMM_WORLD);

/* Calculation */
for(x = 0; x < nblocks ; x++) @{
    if (data_handles[x]) @{
        int owner = starpu_data_get_rank(data_handles[x]);
        if (owner == rank) @{
            starpu_insert_task(&cl, STARPU_RW, data_handles[x], 0);
        @}
    @}
@}

/* Gather the matrix on the root node */
starpu_mpi_gather_detached(data_handles, nblocks, root, MPI_COMM_WORLD);
  2089. @end cartouche
  2090. @c ---------------------------------------------------------------------
  2091. @c Configuration options
  2092. @c ---------------------------------------------------------------------
  2093. @node Configuring StarPU
  2094. @chapter Configuring StarPU
  2095. @menu
  2096. * Compilation configuration::
  2097. * Execution configuration through environment variables::
  2098. @end menu
  2099. @node Compilation configuration
  2100. @section Compilation configuration
  2101. The following arguments can be given to the @code{configure} script.
  2102. @menu
  2103. * Common configuration::
  2104. * Configuring workers::
  2105. * Advanced configuration::
  2106. @end menu
  2107. @node Common configuration
  2108. @subsection Common configuration
  2109. @menu
  2110. * --enable-debug::
  2111. * --enable-fast::
  2112. * --enable-verbose::
  2113. * --enable-coverage::
  2114. @end menu
  2115. @node --enable-debug
  2116. @subsubsection @code{--enable-debug}
  2117. @table @asis
  2118. @item @emph{Description}:
  2119. Enable debugging messages.
  2120. @end table
  2121. @node --enable-fast
  2122. @subsubsection @code{--enable-fast}
  2123. @table @asis
  2124. @item @emph{Description}:
Do not enforce assertions; this saves a lot of time otherwise spent computing them.
  2126. @end table
  2127. @node --enable-verbose
  2128. @subsubsection @code{--enable-verbose}
  2129. @table @asis
  2130. @item @emph{Description}:
  2131. Augment the verbosity of the debugging messages. This can be disabled
  2132. at runtime by setting the environment variable @code{STARPU_SILENT} to
  2133. any value.
  2134. @smallexample
  2135. % STARPU_SILENT=1 ./vector_scal
  2136. @end smallexample
  2137. @end table
  2138. @node --enable-coverage
  2139. @subsubsection @code{--enable-coverage}
  2140. @table @asis
  2141. @item @emph{Description}:
  2142. Enable flags for the @code{gcov} coverage tool.
  2143. @end table
  2144. @node Configuring workers
  2145. @subsection Configuring workers
  2146. @menu
  2147. * --enable-maxcpus::
  2148. * --disable-cpu::
  2149. * --enable-maxcudadev::
  2150. * --disable-cuda::
  2151. * --with-cuda-dir::
  2152. * --with-cuda-include-dir::
  2153. * --with-cuda-lib-dir::
  2154. * --disable-cuda-memcpy-peer::
  2155. * --enable-maxopencldev::
  2156. * --disable-opencl::
  2157. * --with-opencl-dir::
  2158. * --with-opencl-include-dir::
  2159. * --with-opencl-lib-dir::
  2160. * --enable-gordon::
  2161. * --with-gordon-dir::
  2162. * --enable-maximplementations::
  2163. @end menu
  2164. @node --enable-maxcpus
  2165. @subsubsection @code{--enable-maxcpus=<number>}
  2166. @table @asis
  2167. @item @emph{Description}:
  2168. Defines the maximum number of CPU cores that StarPU will support, then
  2169. available as the @code{STARPU_MAXCPUS} macro.
  2170. @end table
  2171. @node --disable-cpu
  2172. @subsubsection @code{--disable-cpu}
  2173. @table @asis
  2174. @item @emph{Description}:
  2175. Disable the use of CPUs of the machine. Only GPUs etc. will be used.
  2176. @end table
  2177. @node --enable-maxcudadev
  2178. @subsubsection @code{--enable-maxcudadev=<number>}
  2179. @table @asis
  2180. @item @emph{Description}:
  2181. Defines the maximum number of CUDA devices that StarPU will support, then
  2182. available as the @code{STARPU_MAXCUDADEVS} macro.
  2183. @end table
  2184. @node --disable-cuda
  2185. @subsubsection @code{--disable-cuda}
  2186. @table @asis
  2187. @item @emph{Description}:
  2188. Disable the use of CUDA, even if a valid CUDA installation was detected.
  2189. @end table
  2190. @node --with-cuda-dir
  2191. @subsubsection @code{--with-cuda-dir=<path>}
  2192. @table @asis
  2193. @item @emph{Description}:
  2194. Specify the directory where CUDA is installed. This directory should notably contain
  2195. @code{include/cuda.h}.
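For instance, with CUDA installed in a typical location (the path is only an
example):
@example
$ ./configure --with-cuda-dir=/usr/local/cuda
@end example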
  2196. @end table
  2197. @node --with-cuda-include-dir
  2198. @subsubsection @code{--with-cuda-include-dir=<path>}
  2199. @table @asis
  2200. @item @emph{Description}:
  2201. Specify the directory where CUDA headers are installed. This directory should
  2202. notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
  2203. value given to @code{--with-cuda-dir}.
  2204. @end table
  2205. @node --with-cuda-lib-dir
  2206. @subsubsection @code{--with-cuda-lib-dir=<path>}
  2207. @table @asis
  2208. @item @emph{Description}:
  2209. Specify the directory where the CUDA library is installed. This directory should
  2210. notably contain the CUDA shared libraries (e.g. libcuda.so). This defaults to
  2211. @code{/lib} appended to the value given to @code{--with-cuda-dir}.
  2212. @end table
  2213. @node --disable-cuda-memcpy-peer
  2214. @subsubsection @code{--disable-cuda-memcpy-peer}
  2215. @table @asis
@item @emph{Description}:
Explicitly disables peer transfers when using CUDA 4.0.
  2218. @end table
  2219. @node --enable-maxopencldev
  2220. @subsubsection @code{--enable-maxopencldev=<number>}
  2221. @table @asis
  2222. @item @emph{Description}:
  2223. Defines the maximum number of OpenCL devices that StarPU will support, then
  2224. available as the @code{STARPU_MAXOPENCLDEVS} macro.
  2225. @end table
  2226. @node --disable-opencl
  2227. @subsubsection @code{--disable-opencl}
  2228. @table @asis
  2229. @item @emph{Description}:
  2230. Disable the use of OpenCL, even if the SDK is detected.
  2231. @end table
  2232. @node --with-opencl-dir
  2233. @subsubsection @code{--with-opencl-dir=<path>}
  2234. @table @asis
  2235. @item @emph{Description}:
  2236. Specify the location of the OpenCL SDK. This directory should notably contain
  2237. @code{include/CL/cl.h} (or @code{include/OpenCL/cl.h} on Mac OS).
  2238. @end table
  2239. @node --with-opencl-include-dir
  2240. @subsubsection @code{--with-opencl-include-dir=<path>}
  2241. @table @asis
  2242. @item @emph{Description}:
  2243. Specify the location of OpenCL headers. This directory should notably contain
  2244. @code{CL/cl.h} (or @code{OpenCL/cl.h} on Mac OS). This defaults to
  2245. @code{/include} appended to the value given to @code{--with-opencl-dir}.
  2246. @end table
  2247. @node --with-opencl-lib-dir
  2248. @subsubsection @code{--with-opencl-lib-dir=<path>}
  2249. @table @asis
  2250. @item @emph{Description}:
  2251. Specify the location of the OpenCL library. This directory should notably
  2252. contain the OpenCL shared libraries (e.g. libOpenCL.so). This defaults to
  2253. @code{/lib} appended to the value given to @code{--with-opencl-dir}.
  2254. @end table
  2255. @node --enable-gordon
  2256. @subsubsection @code{--enable-gordon}
  2257. @table @asis
  2258. @item @emph{Description}:
  2259. Enable the use of the Gordon runtime for Cell SPUs.
  2260. @c TODO: rather default to enabled when detected
  2261. @end table
  2262. @node --with-gordon-dir
  2263. @subsubsection @code{--with-gordon-dir=<path>}
  2264. @table @asis
  2265. @item @emph{Description}:
  2266. Specify the location of the Gordon SDK.
  2267. @end table
  2268. @node --enable-maximplementations
  2269. @subsubsection @code{--enable-maximplementations=<number>}
  2270. @table @asis
  2271. @item @emph{Description}:
  2272. Defines the number of implementations that can be defined for a single kind of
  2273. device. It is then available as the @code{STARPU_MAXIMPLEMENTATIONS} macro.
  2274. @end table
  2275. @node Advanced configuration
  2276. @subsection Advanced configuration
  2277. @menu
  2278. * --enable-perf-debug::
  2279. * --enable-model-debug::
  2280. * --enable-stats::
  2281. * --enable-maxbuffers::
  2282. * --enable-allocation-cache::
  2283. * --enable-opengl-render::
  2284. * --enable-blas-lib::
  2285. * --with-magma::
  2286. * --with-fxt::
  2287. * --with-perf-model-dir::
  2288. * --with-mpicc::
  2289. * --with-goto-dir::
  2290. * --with-atlas-dir::
  2291. * --with-mkl-cflags::
  2292. * --with-mkl-ldflags::
  2293. @end menu
  2294. @node --enable-perf-debug
  2295. @subsubsection @code{--enable-perf-debug}
  2296. @table @asis
  2297. @item @emph{Description}:
  2298. Enable performance debugging through gprof.
  2299. @end table
  2300. @node --enable-model-debug
  2301. @subsubsection @code{--enable-model-debug}
  2302. @table @asis
  2303. @item @emph{Description}:
  2304. Enable performance model debugging.
  2305. @end table
  2306. @node --enable-stats
  2307. @subsubsection @code{--enable-stats}
  2308. @table @asis
  2309. @item @emph{Description}:
  2310. Enable statistics.
  2311. @end table
  2312. @node --enable-maxbuffers
  2313. @subsubsection @code{--enable-maxbuffers=<nbuffers>}
  2314. @table @asis
  2315. @item @emph{Description}:
  2316. Define the maximum number of buffers that tasks will be able to take
  2317. as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
  2318. @end table
  2319. @node --enable-allocation-cache
  2320. @subsubsection @code{--enable-allocation-cache}
  2321. @table @asis
  2322. @item @emph{Description}:
Enable the use of a data allocation cache to avoid the cost of memory
allocations with CUDA. Still experimental.
  2325. @end table
  2326. @node --enable-opengl-render
  2327. @subsubsection @code{--enable-opengl-render}
  2328. @table @asis
  2329. @item @emph{Description}:
  2330. Enable the use of OpenGL for the rendering of some examples.
  2331. @c TODO: rather default to enabled when detected
  2332. @end table
  2333. @node --enable-blas-lib
  2334. @subsubsection @code{--enable-blas-lib=<name>}
  2335. @table @asis
  2336. @item @emph{Description}:
Specify the BLAS library to be used by some of the examples. The
library has to be 'atlas' or 'goto'.
  2339. @end table
  2340. @node --with-magma
  2341. @subsubsection @code{--with-magma=<path>}
  2342. @table @asis
  2343. @item @emph{Description}:
  2344. Specify where magma is installed. This directory should notably contain
  2345. @code{include/magmablas.h}.
  2346. @end table
  2347. @node --with-fxt
  2348. @subsubsection @code{--with-fxt=<path>}
  2349. @table @asis
  2350. @item @emph{Description}:
  2351. Specify the location of FxT (for generating traces and rendering them
  2352. using ViTE). This directory should notably contain
  2353. @code{include/fxt/fxt.h}.
  2354. @c TODO add ref to other section
  2355. @end table
  2356. @node --with-perf-model-dir
  2357. @subsubsection @code{--with-perf-model-dir=<dir>}
  2358. @table @asis
  2359. @item @emph{Description}:
  2360. Specify where performance models should be stored (instead of defaulting to the
  2361. current user's home).
  2362. @end table
  2363. @node --with-mpicc
  2364. @subsubsection @code{--with-mpicc=<path to mpicc>}
  2365. @table @asis
  2366. @item @emph{Description}:
  2367. Specify the location of the @code{mpicc} compiler to be used for starpumpi.
  2368. @end table
  2369. @node --with-goto-dir
  2370. @subsubsection @code{--with-goto-dir=<dir>}
  2371. @table @asis
  2372. @item @emph{Description}:
  2373. Specify the location of GotoBLAS.
  2374. @end table
  2375. @node --with-atlas-dir
  2376. @subsubsection @code{--with-atlas-dir=<dir>}
  2377. @table @asis
  2378. @item @emph{Description}:
  2379. Specify the location of ATLAS. This directory should notably contain
  2380. @code{include/cblas.h}.
  2381. @end table
  2382. @node --with-mkl-cflags
  2383. @subsubsection @code{--with-mkl-cflags=<cflags>}
  2384. @table @asis
  2385. @item @emph{Description}:
  2386. Specify the compilation flags for the MKL Library.
  2387. @end table
  2388. @node --with-mkl-ldflags
  2389. @subsubsection @code{--with-mkl-ldflags=<ldflags>}
  2390. @table @asis
  2391. @item @emph{Description}:
  2392. Specify the linking flags for the MKL Library. Note that the
  2393. @url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/}
  2394. website provides a script to determine the linking flags.
  2395. @end table
  2396. @c ---------------------------------------------------------------------
  2397. @c Environment variables
  2398. @c ---------------------------------------------------------------------
  2399. @node Execution configuration through environment variables
  2400. @section Execution configuration through environment variables
  2401. @menu
  2402. * Workers:: Configuring workers
  2403. * Scheduling:: Configuring the Scheduling engine
  2404. * Misc:: Miscellaneous and debug
  2405. @end menu
Note: the values given in the @code{starpu_conf} structure passed when
  2407. calling @code{starpu_init} will override the values of the environment
  2408. variables.
  2409. @node Workers
  2410. @subsection Configuring workers
  2411. @menu
  2412. * STARPU_NCPUS:: Number of CPU workers
  2413. * STARPU_NCUDA:: Number of CUDA workers
  2414. * STARPU_NOPENCL:: Number of OpenCL workers
  2415. * STARPU_NGORDON:: Number of SPU workers (Cell)
  2416. * STARPU_WORKERS_CPUID:: Bind workers to specific CPUs
  2417. * STARPU_WORKERS_CUDAID:: Select specific CUDA devices
  2418. * STARPU_WORKERS_OPENCLID:: Select specific OpenCL devices
  2419. @end menu
  2420. @node STARPU_NCPUS
  2421. @subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
  2422. @table @asis
  2423. @item @emph{Description}:
Specify the number of CPU workers (thus not including workers dedicated to control accelerators). Note that by default, StarPU will not allocate
  2425. more CPU workers than there are physical CPUs, and that some CPUs are used to control
  2426. the accelerators.
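For example, to restrict a run to two CPU workers (the application name is
only illustrative):
@smallexample
% STARPU_NCPUS=2 ./vector_scal
@end smallexample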
  2427. @end table
  2428. @node STARPU_NCUDA
  2429. @subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
  2430. @table @asis
  2431. @item @emph{Description}:
  2432. Specify the number of CUDA devices that StarPU can use. If
  2433. @code{STARPU_NCUDA} is lower than the number of physical devices, it is
  2434. possible to select which CUDA devices should be used by the means of the
  2435. @code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
  2436. create as many CUDA workers as there are CUDA devices.
  2437. @end table
  2438. @node STARPU_NOPENCL
  2439. @subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
  2440. @table @asis
  2441. @item @emph{Description}:
  2442. OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
  2443. @end table
  2444. @node STARPU_NGORDON
  2445. @subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
  2446. @table @asis
  2447. @item @emph{Description}:
  2448. Specify the number of SPUs that StarPU can use.
  2449. @end table
  2450. @node STARPU_WORKERS_CPUID
  2451. @subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
  2452. @table @asis
  2453. @item @emph{Description}:
  2454. Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
  2455. specifies on which logical CPU the different workers should be
  2456. bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
  2457. worker will be bound to logical CPU #0, the second CPU worker will be bound to
  2458. logical CPU #1 and so on. Note that the logical ordering of the CPUs is either
  2459. determined by the OS, or provided by the @code{hwloc} library in case it is
  2460. available.
  2461. Note that the first workers correspond to the CUDA workers, then come the
  2462. OpenCL and the SPU, and finally the CPU workers. For example if
  2463. we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
  2464. and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
  2465. by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
  2466. the logical CPUs #1 and #3 will be used by the CPU workers.
  2467. If the number of workers is larger than the array given in
  2468. @code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
  2469. round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
  2470. third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
  2471. This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
  2472. @code{starpu_conf} structure passed to @code{starpu_init} is set.
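For instance, the binding described above could be requested from the shell as
follows (the application name is only illustrative):
@smallexample
% STARPU_NCUDA=1 STARPU_NOPENCL=1 STARPU_NCPUS=2 \
  STARPU_WORKERS_CPUID="0 2 1 3" ./vector_scal
@end smallexample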
  2473. @end table
  2474. @node STARPU_WORKERS_CUDAID
  2475. @subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
  2476. @table @asis
  2477. @item @emph{Description}:
  2478. Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
  2479. possible to select which CUDA devices should be used by StarPU. On a machine
  2480. equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
  2481. @code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
  2482. they should use CUDA devices #1 and #3 (the logical ordering of the devices is
  2483. the one reported by CUDA).
  2484. This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
  2485. the @code{starpu_conf} structure passed to @code{starpu_init} is set.
  2486. @end table
  2487. @node STARPU_WORKERS_OPENCLID
  2488. @subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
  2489. @table @asis
  2490. @item @emph{Description}:
  2491. OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
  2492. This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
  2493. the @code{starpu_conf} structure passed to @code{starpu_init} is set.
  2494. @end table
  2495. @node Scheduling
  2496. @subsection Configuring the Scheduling engine
  2497. @menu
  2498. * STARPU_SCHED:: Scheduling policy
  2499. * STARPU_CALIBRATE:: Calibrate performance models
  2500. * STARPU_PREFETCH:: Use data prefetch
  2501. * STARPU_SCHED_ALPHA:: Computation factor
  2502. * STARPU_SCHED_BETA:: Communication factor
  2503. @end menu
  2504. @node STARPU_SCHED
  2505. @subsubsection @code{STARPU_SCHED} -- Scheduling policy
  2506. @table @asis
  2507. @item @emph{Description}:
This chooses between the different scheduling policies proposed by StarPU: work
stealing, random, greedy, with performance models, etc.
  2510. Use @code{STARPU_SCHED=help} to get the list of available schedulers.
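For instance, to select the @code{dmda} policy for one run of an application
(the application name is only illustrative):
@smallexample
% STARPU_SCHED=dmda ./vector_scal
@end smallexample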
  2511. @end table
  2512. @node STARPU_CALIBRATE
  2513. @subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
  2514. @table @asis
  2515. @item @emph{Description}:
  2516. If this variable is set to 1, the performance models are calibrated during
  2517. the execution. If it is set to 2, the previous values are dropped to restart
calibration from scratch. Setting this variable to 0 disables calibration; this
is the default behaviour.
  2520. Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
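For example, a few calibration runs can be performed with a model-based policy
before the measurements are exploited (the application name is only
illustrative):
@smallexample
% STARPU_CALIBRATE=1 STARPU_SCHED=dmda ./vector_scal
@end smallexample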
  2521. @end table
  2522. @node STARPU_PREFETCH
  2523. @subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
  2524. @table @asis
  2525. @item @emph{Description}:
  2526. This variable indicates whether data prefetching should be enabled (0 means
  2527. that it is disabled). If prefetching is enabled, when a task is scheduled to be
  2528. executed e.g. on a GPU, StarPU will request an asynchronous transfer in
  2529. advance, so that data is already present on the GPU when the task starts. As a
  2530. result, computation and data transfers are overlapped.
  2531. Note that prefetching is enabled by default in StarPU.
  2532. @end table
  2533. @node STARPU_SCHED_ALPHA
  2534. @subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
  2535. @table @asis
  2536. @item @emph{Description}:
  2537. To estimate the cost of a task StarPU takes into account the estimated
  2538. computation time (obtained thanks to performance models). The alpha factor is
  2539. the coefficient to be applied to it before adding it to the communication part.
  2540. @end table
  2541. @node STARPU_SCHED_BETA
  2542. @subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
  2543. @table @asis
  2544. @item @emph{Description}:
  2545. To estimate the cost of a task StarPU takes into account the estimated
  2546. data transfer time (obtained thanks to performance models). The beta factor is
  2547. the coefficient to be applied to it before adding it to the computation part.
  2548. @end table
  2549. @node Misc
  2550. @subsection Miscellaneous and debug
  2551. @menu
  2552. * STARPU_SILENT:: Disable verbose mode
  2553. * STARPU_LOGFILENAME:: Select debug file name
  2554. * STARPU_FXT_PREFIX:: FxT trace location
  2555. * STARPU_LIMIT_GPU_MEM:: Restrict memory size on the GPUs
  2556. * STARPU_GENERATE_TRACE:: Generate a Paje trace when StarPU is shut down
  2557. @end menu
  2558. @node STARPU_SILENT
  2559. @subsubsection @code{STARPU_SILENT} -- Disable verbose mode
  2560. @table @asis
  2561. @item @emph{Description}:
This variable makes it possible to disable verbose mode at runtime when StarPU
has been configured with the option @code{--enable-verbose}.
  2564. @end table
  2565. @node STARPU_LOGFILENAME
  2566. @subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
  2567. @table @asis
  2568. @item @emph{Description}:
This variable specifies the file in which the debugging output should be saved.
  2570. @end table
  2571. @node STARPU_FXT_PREFIX
  2572. @subsubsection @code{STARPU_FXT_PREFIX} -- FxT trace location
  2573. @table @asis
  2574. @item @emph{Description}
  2575. This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
  2576. @end table
  2577. @node STARPU_LIMIT_GPU_MEM
  2578. @subsubsection @code{STARPU_LIMIT_GPU_MEM} -- Restrict memory size on the GPUs
  2579. @table @asis
  2580. @item @emph{Description}
This variable specifies the maximum number of megabytes that should be
available to the application on each GPU. In case this value is smaller than
the size of the memory of a GPU, StarPU pre-allocates a buffer which occupies
the remaining memory of the device. This variable is intended to be used for
experimental purposes as it emulates devices that have a limited amount of memory.
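For instance, to emulate GPUs with only 1024 megabytes of memory (the
application name is only illustrative):
@smallexample
% STARPU_LIMIT_GPU_MEM=1024 ./vector_scal
@end smallexample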
  2586. @end table
  2587. @node STARPU_GENERATE_TRACE
  2588. @subsubsection @code{STARPU_GENERATE_TRACE} -- Generate a Paje trace when StarPU is shut down
  2589. @table @asis
  2590. @item @emph{Description}
  2591. When set to 1, this variable indicates that StarPU should automatically
generate a Paje trace when @code{starpu_shutdown} is called.
  2593. @end table
  2594. @c ---------------------------------------------------------------------
  2595. @c StarPU API
  2596. @c ---------------------------------------------------------------------
  2597. @node StarPU API
  2598. @chapter StarPU API
  2599. @menu
  2600. * Initialization and Termination:: Initialization and Termination methods
  2601. * Workers' Properties:: Methods to enumerate workers' properties
  2602. * Data Library:: Methods to manipulate data
  2603. * Data Interfaces::
  2604. * Data Partition::
  2605. * Codelets and Tasks:: Methods to construct tasks
  2606. * Explicit Dependencies:: Explicit Dependencies
  2607. * Implicit Data Dependencies:: Implicit Data Dependencies
  2608. * Performance Model API::
  2609. * Profiling API:: Profiling API
  2610. * CUDA extensions:: CUDA extensions
  2611. * OpenCL extensions:: OpenCL extensions
  2612. * Cell extensions:: Cell extensions
  2613. * Miscellaneous helpers::
  2614. @end menu
  2615. @node Initialization and Termination
  2616. @section Initialization and Termination
  2617. @menu
  2618. * starpu_init:: Initialize StarPU
  2619. * struct starpu_conf:: StarPU runtime configuration
  2620. * starpu_conf_init:: Initialize starpu_conf structure
  2621. * starpu_shutdown:: Terminate StarPU
  2622. @end menu
  2623. @node starpu_init
  2624. @subsection @code{starpu_init} -- Initialize StarPU
  2625. @table @asis
  2626. @item @emph{Description}:
This is the StarPU initialization method, which must be called prior to any other
  2628. StarPU call. It is possible to specify StarPU's configuration (e.g. scheduling
  2629. policy, number of cores, ...) by passing a non-null argument. Default
  2630. configuration is used if the passed argument is @code{NULL}.
  2631. @item @emph{Return value}:
  2632. Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
  2633. indicates that no worker was available (so that StarPU was not initialized).
  2634. @item @emph{Prototype}:
  2635. @code{int starpu_init(struct starpu_conf *conf);}
  2636. @end table
  2637. @node struct starpu_conf
  2638. @subsection @code{struct starpu_conf} -- StarPU runtime configuration
  2639. @table @asis
  2640. @item @emph{Description}:
  2641. This structure is passed to the @code{starpu_init} function in order
  2642. to configure StarPU.
  2643. When the default value is used, StarPU automatically selects the number
  2644. of processing units and takes the default scheduling policy. This parameter
  2645. overwrites the equivalent environment variables.
  2646. @item @emph{Fields}:
  2647. @table @asis
  2648. @item @code{sched_policy_name} (default = NULL):
  2649. This is the name of the scheduling policy. This can also be specified with the
  2650. @code{STARPU_SCHED} environment variable.
  2651. @item @code{sched_policy} (default = NULL):
  2652. This is the definition of the scheduling policy. This field is ignored
  2653. if @code{sched_policy_name} is set.
  2654. @item @code{ncpus} (default = -1):
  2655. This is the number of CPU cores that StarPU can use. This can also be
  2656. specified with the @code{STARPU_NCPUS} environment variable.
  2657. @item @code{ncuda} (default = -1):
  2658. This is the number of CUDA devices that StarPU can use. This can also be
  2659. specified with the @code{STARPU_NCUDA} environment variable.
  2660. @item @code{nopencl} (default = -1):
  2661. This is the number of OpenCL devices that StarPU can use. This can also be
  2662. specified with the @code{STARPU_NOPENCL} environment variable.
  2663. @item @code{nspus} (default = -1):
  2664. This is the number of Cell SPUs that StarPU can use. This can also be
  2665. specified with the @code{STARPU_NGORDON} environment variable.
  2666. @item @code{use_explicit_workers_bindid} (default = 0)
  2667. If this flag is set, the @code{workers_bindid} array indicates where the
  2668. different workers are bound, otherwise StarPU automatically selects where to
  2669. bind the different workers unless the @code{STARPU_WORKERS_CPUID} environment
  2670. variable is set. The @code{STARPU_WORKERS_CPUID} environment variable is
  2671. ignored if the @code{use_explicit_workers_bindid} flag is set.
  2672. @item @code{workers_bindid[STARPU_NMAXWORKERS]}
  2673. If the @code{use_explicit_workers_bindid} flag is set, this array indicates
  2674. where to bind the different workers. The i-th entry of the
  2675. @code{workers_bindid} indicates the logical identifier of the processor which
  2676. should execute the i-th worker. Note that the logical ordering of the CPUs is
  2677. either determined by the OS, or provided by the @code{hwloc} library in case it
  2678. is available.
  2679. When this flag is set, the @ref{STARPU_WORKERS_CPUID} environment variable is
  2680. ignored.
  2681. @item @code{use_explicit_workers_cuda_gpuid} (default = 0)
  2682. If this flag is set, the CUDA workers will be attached to the CUDA devices
specified in the @code{workers_cuda_gpuid} array. Otherwise, StarPU assigns the
CUDA devices in a round-robin fashion.
  2685. When this flag is set, the @ref{STARPU_WORKERS_CUDAID} environment variable is
  2686. ignored.
  2687. @item @code{workers_cuda_gpuid[STARPU_NMAXWORKERS]}
  2688. If the @code{use_explicit_workers_cuda_gpuid} flag is set, this array contains
  2689. the logical identifiers of the CUDA devices (as used by @code{cudaGetDevice}).
  2690. @item @code{use_explicit_workers_opencl_gpuid} (default = 0)
  2691. If this flag is set, the OpenCL workers will be attached to the OpenCL devices
specified in the @code{workers_opencl_gpuid} array. Otherwise, StarPU assigns the
OpenCL devices in a round-robin fashion.
@item @code{workers_opencl_gpuid[STARPU_NMAXWORKERS]}:
If the @code{use_explicit_workers_opencl_gpuid} flag is set, this array contains
the logical identifiers of the OpenCL devices to be used.
  2695. @item @code{calibrate} (default = 0):
  2696. If this flag is set, StarPU will calibrate the performance models when
  2697. executing tasks. If this value is equal to -1, the default value is used. The
  2698. default value is overwritten by the @code{STARPU_CALIBRATE} environment
  2699. variable when it is set.
  2700. @end table
  2701. @item @code{single_combined_worker} (default = 0):
  2702. By default, StarPU creates various combined workers according to the machine
  2703. structure. Some parallel libraries (e.g. most OpenMP implementations) however do
  2704. not support concurrent calls to parallel code. In such case, setting this flag
  2705. makes StarPU only create one combined worker, containing all
  2706. the CPU workers. The default value is overwritten by the
  2707. @code{STARPU_SINGLE_COMBINED_WORKER} environment variable when it is set.
  2708. @end table
  2709. @node starpu_conf_init
  2710. @subsection @code{starpu_conf_init} -- Initialize starpu_conf structure
  2711. @table @asis
@item @emph{Description}:
This function initializes the @code{starpu_conf} structure passed as argument
with the default values. In case some configuration parameters are already
specified through environment variables, @code{starpu_conf_init} initializes
the fields of the structure according to the environment variables. For
instance if @code{STARPU_CALIBRATE} is set, its value is put in the
@code{.calibrate} field of the structure passed as argument.
  2718. @item @emph{Return value}:
  2719. Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
  2720. indicates that the argument was NULL.
  2721. @item @emph{Prototype}:
  2722. @code{int starpu_conf_init(struct starpu_conf *conf);}
  2723. @end table
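As a short sketch (error handling reduced to the @code{-ENODEV} case
documented above), a configuration can be filled in and passed to
@code{starpu_init} as follows; the policy name and worker count are arbitrary
examples:
@cartouche
@smallexample
struct starpu_conf conf;

starpu_conf_init(&conf);
conf.sched_policy_name = "dmda"; /* example policy name */
conf.ncpus = 2;                  /* limit StarPU to two CPU workers */
conf.calibrate = 1;              /* calibrate the performance models */

int ret = starpu_init(&conf);
if (ret == -ENODEV)
    fprintf(stderr, "no worker is available\n");
@end smallexample
@end cartouche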
  2724. @node starpu_shutdown
  2725. @subsection @code{starpu_shutdown} -- Terminate StarPU
  2726. @deftypefun void starpu_shutdown (void)
This is the StarPU termination method. It must be called at the end of the
  2728. application: statistics and other post-mortem debugging information are not
  2729. guaranteed to be available until this method has been called.
  2730. @end deftypefun
  2731. @node Workers' Properties
  2732. @section Workers' Properties
  2733. @menu
  2734. * starpu_worker_get_count:: Get the number of processing units
  2735. * starpu_worker_get_count_by_type:: Get the number of processing units of a given type
  2736. * starpu_cpu_worker_get_count:: Get the number of CPU controlled by StarPU
  2737. * starpu_cuda_worker_get_count:: Get the number of CUDA devices controlled by StarPU
  2738. * starpu_opencl_worker_get_count:: Get the number of OpenCL devices controlled by StarPU
  2739. * starpu_spu_worker_get_count:: Get the number of Cell SPUs controlled by StarPU
  2740. * starpu_worker_get_id:: Get the identifier of the current worker
  2741. * starpu_worker_get_ids_by_type:: Get the list of identifiers of workers with a given type
  2742. * starpu_worker_get_devid:: Get the device identifier of a worker
  2743. * starpu_worker_get_type:: Get the type of processing unit associated to a worker
  2744. * starpu_worker_get_name:: Get the name of a worker
  2745. * starpu_worker_get_memory_node:: Get the memory node of a worker
  2746. @end menu
  2747. @node starpu_worker_get_count
  2748. @subsection @code{starpu_worker_get_count} -- Get the number of processing units
  2749. @deftypefun unsigned starpu_worker_get_count (void)
  2750. This function returns the number of workers (i.e. processing units executing
  2751. StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
  2752. @end deftypefun
  2753. @node starpu_worker_get_count_by_type
  2754. @subsection @code{starpu_worker_get_count_by_type} -- Get the number of processing units of a given type
  2755. @deftypefun int starpu_worker_get_count_by_type ({enum starpu_archtype} @var{type})
Returns the number of workers of the type indicated by the argument. A positive
(or null) value is returned in case of success; otherwise @code{-EINVAL}
indicates that the type is not valid.
  2759. @end deftypefun
  2760. @node starpu_cpu_worker_get_count
  2761. @subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPU controlled by StarPU
  2762. @deftypefun unsigned starpu_cpu_worker_get_count (void)
  2763. This function returns the number of CPUs controlled by StarPU. The returned
  2764. value should be at most @code{STARPU_MAXCPUS}.
  2765. @end deftypefun
  2766. @node starpu_cuda_worker_get_count
  2767. @subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU
  2768. @deftypefun unsigned starpu_cuda_worker_get_count (void)
  2769. This function returns the number of CUDA devices controlled by StarPU. The returned
  2770. value should be at most @code{STARPU_MAXCUDADEVS}.
  2771. @end deftypefun
  2772. @node starpu_opencl_worker_get_count
  2773. @subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU
  2774. @deftypefun unsigned starpu_opencl_worker_get_count (void)
  2775. This function returns the number of OpenCL devices controlled by StarPU. The returned
  2776. value should be at most @code{STARPU_MAXOPENCLDEVS}.
  2777. @end deftypefun
  2778. @node starpu_spu_worker_get_count
  2779. @subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU
  2780. @deftypefun unsigned starpu_spu_worker_get_count (void)
  2781. This function returns the number of Cell SPUs controlled by StarPU.
  2782. @end deftypefun
  2783. @node starpu_worker_get_id
  2784. @subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker
  2785. @deftypefun int starpu_worker_get_id (void)
  2786. This function returns the identifier of the worker associated to the calling
  2787. thread. The returned value is either -1 if the current context is not a StarPU
  2788. worker (i.e. when called from the application outside a task or a callback), or
  2789. an integer between 0 and @code{starpu_worker_get_count() - 1}.
  2790. @end deftypefun
  2791. @node starpu_worker_get_ids_by_type
  2792. @subsection @code{starpu_worker_get_ids_by_type} -- Get the list of identifiers of workers with a given type
  2793. @deftypefun int starpu_worker_get_ids_by_type ({enum starpu_archtype} @var{type}, int *@var{workerids}, int @var{maxsize})
Fill the @var{workerids} array with the identifiers of the workers that have the type
indicated in the first argument. The @var{maxsize} argument indicates the size of the
@var{workerids} array. The returned value gives the number of identifiers that were put
in the array. @code{-ERANGE} is returned if @var{maxsize} is lower than the number of
workers with the appropriate type: in that case, the array is filled with the
first @var{maxsize} elements. To avoid such overflows, the value of @var{maxsize} can be
chosen by means of the @code{starpu_worker_get_count_by_type} function, or
by passing a value greater than or equal to @code{STARPU_NMAXWORKERS}.
  2802. @end deftypefun
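A minimal sketch, assuming one simply wants the identifiers of all CUDA
workers:
@cartouche
@smallexample
int workerids[STARPU_NMAXWORKERS];
int ncuda = starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER,
                                          workerids, STARPU_NMAXWORKERS);
/* workerids[0..ncuda-1] now contain the CUDA worker identifiers */
@end smallexample
@end cartouche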
  2803. @node starpu_worker_get_devid
  2804. @subsection @code{starpu_worker_get_devid} -- Get the device identifier of a worker
  2805. @deftypefun int starpu_worker_get_devid (int @var{id})
This function returns the device id of the worker associated to an identifier
  2807. (as returned by the @code{starpu_worker_get_id} function). In the case of a
  2808. CUDA worker, this device identifier is the logical device identifier exposed by
  2809. CUDA (used by the @code{cudaGetDevice} function for instance). The device
  2810. identifier of a CPU worker is the logical identifier of the core on which the
  2811. worker was bound; this identifier is either provided by the OS or by the
  2812. @code{hwloc} library in case it is available.
  2813. @end deftypefun
  2814. @node starpu_worker_get_type
  2815. @subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker
  2816. @deftypefun {enum starpu_archtype} starpu_worker_get_type (int @var{id})
  2817. This function returns the type of worker associated to an identifier (as
  2818. returned by the @code{starpu_worker_get_id} function). The returned value
  2819. indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
  2820. core, @code{STARPU_CUDA_WORKER} for a CUDA device,
@code{STARPU_OPENCL_WORKER} for an OpenCL device, and
  2822. @code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
  2823. identifier is unspecified.
  2824. @end deftypefun
  2825. @node starpu_worker_get_name
  2826. @subsection @code{starpu_worker_get_name} -- Get the name of a worker
  2827. @deftypefun void starpu_worker_get_name (int @var{id}, char *@var{dst}, size_t @var{maxlen})
StarPU associates a unique human readable string to each processing unit. This
function copies at most the first @var{maxlen} bytes of the unique string
associated to a worker identified by its identifier @var{id} into the
@var{dst} buffer. The caller is responsible for ensuring that @var{dst}
is a valid pointer to a buffer of at least @var{maxlen} bytes. Calling this
function on an invalid identifier results in unspecified behaviour.
  2834. @end deftypefun
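For instance, the name of the current worker can be retrieved into a
fixed-size buffer (the buffer size is an arbitrary example):
@cartouche
@smallexample
int workerid = starpu_worker_get_id(); /* valid when called from a worker */
char name[64];                         /* 64 bytes is an arbitrary size */

if (workerid >= 0)
    starpu_worker_get_name(workerid, name, sizeof(name));
@end smallexample
@end cartouche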
  2835. @node starpu_worker_get_memory_node
  2836. @subsection @code{starpu_worker_get_memory_node} -- Get the memory node of a worker
  2837. @deftypefun unsigned starpu_worker_get_memory_node (unsigned @var{workerid})
  2838. This function returns the identifier of the memory node associated to the
  2839. worker identified by @var{workerid}.
  2840. @end deftypefun
  2841. @node Data Library
  2842. @section Data Library
  2843. This section describes the data management facilities provided by StarPU.
  2844. We show how to use existing data interfaces in @ref{Data Interfaces}, but developers can
  2845. design their own data interfaces if required.
  2846. @menu
  2847. * starpu_malloc:: Allocate data and pin it
  2848. * starpu_access_mode:: Data access mode
  2849. * unsigned memory_node:: Memory node
  2850. * starpu_data_handle:: StarPU opaque data handle
  2851. * void *interface:: StarPU data interface
  2852. * starpu_data_register:: Register a piece of data to StarPU
  2853. * starpu_data_unregister:: Unregister a piece of data from StarPU
  2854. * starpu_data_unregister_no_coherency:: Unregister a piece of data from StarPU without coherency
  2855. * starpu_data_invalidate:: Invalidate all data replicates
  2856. * starpu_data_acquire:: Access registered data from the application
  2857. * starpu_data_acquire_cb:: Access registered data from the application asynchronously
  2858. * STARPU_DATA_ACQUIRE_CB:: Access registered data from the application asynchronously, macro
  2859. * starpu_data_release:: Release registered data from the application
  2860. * starpu_data_set_wt_mask:: Set the Write-Through mask
  2861. * starpu_data_prefetch_on_node:: Prefetch data to a given node
  2862. @end menu
  2863. @node starpu_malloc
  2864. @subsection @code{starpu_malloc} -- Allocate data and pin it
  2865. @deftypefun int starpu_malloc (void **@var{A}, size_t @var{dim})
  2866. This function allocates data of the given size in main memory. It will also try to pin it in
  2867. CUDA or OpenCL, so that data transfers from this buffer can be asynchronous, and
thus permit overlapping data transfers and computation. The allocated buffer must
be freed with the @code{starpu_free} function.
  2870. @end deftypefun
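A minimal sketch of allocating a pinned buffer and releasing it (the size is an
arbitrary example):
@cartouche
@smallexample
float *buffer;

/* Allocate (and pin, when CUDA or OpenCL is used) 1024 floats */
starpu_malloc((void **)&buffer, 1024 * sizeof(float));

/* ... register the buffer and use it in tasks ... */

starpu_free(buffer);
@end smallexample
@end cartouche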
  2871. @node starpu_access_mode
  2872. @subsection @code{starpu_access_mode} -- Data access mode
  2873. This datatype describes a data access mode. The different available modes are:
  2874. @table @asis
  2875. @table @asis
  2876. @item @code{STARPU_R} read-only mode.
  2877. @item @code{STARPU_W} write-only mode.
  2878. @item @code{STARPU_RW} read-write mode. This is equivalent to @code{STARPU_R|STARPU_W}.
  2879. @item @code{STARPU_SCRATCH} scratch memory. A temporary buffer is allocated for the task, but StarPU does not enforce data consistency, i.e. each device has its own buffer, independently from each other (even for CPUs). This is useful for temporary variables. For now, no behaviour is defined concerning the relation with STARPU_R/W modes and the value provided at registration, i.e. the value of the scratch buffer is undefined at entry of the codelet function, but this is being considered for future extensions.
  2880. @item @code{STARPU_REDUX} reduction mode. TODO: document, as well as @code{starpu_data_set_reduction_methods}
  2881. @end table
  2882. @end table
  2883. @node unsigned memory_node
  2884. @subsection @code{unsigned memory_node} -- Memory node
  2885. @table @asis
  2886. @item @emph{Description}:
  2887. Every worker is associated to a memory node which is a logical abstraction of
  2888. the address space from which the processing unit gets its data. For instance,
  2889. the memory node associated to the different CPU workers represents main memory
  2890. (RAM), the memory node associated to a GPU is DRAM embedded on the device.
  2891. Every memory node is identified by a logical index which is accessible from the
  2892. @code{starpu_worker_get_memory_node} function. When registering a piece of data
  2893. to StarPU, the specified memory node indicates where the piece of data
  2894. initially resides (we also call this memory node the home node of a piece of
  2895. data).
  2896. @end table
  2897. @node starpu_data_handle
  2898. @subsection @code{starpu_data_handle} -- StarPU opaque data handle
  2899. @table @asis
  2900. @item @emph{Description}:
  2901. StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
  2902. data. Once a piece of data has been registered to StarPU, it is associated to a
  2903. @code{starpu_data_handle} which keeps track of the state of the piece of data
  2904. over the entire machine, so that we can maintain data consistency and locate
  2905. data replicates for instance.
  2906. @end table
  2907. @node void *interface
  2908. @subsection @code{void *interface} -- StarPU data interface
  2909. @table @asis
  2910. @item @emph{Description}:
  2911. Data management is done at a high-level in StarPU: rather than accessing a mere
  2912. list of contiguous buffers, the tasks may manipulate data that are described by
  2913. a high-level construct which we call data interface.
  2914. An example of data interface is the "vector" interface which describes a
contiguous data array on a specific memory node. This interface is a simple
  2916. structure containing the number of elements in the array, the size of the
  2917. elements, and the address of the array in the appropriate address space (this
  2918. address may be invalid if there is no valid copy of the array in the memory
node). More information on the data interfaces provided by StarPU is
  2920. given in @ref{Data Interfaces}.
  2921. When a piece of data managed by StarPU is used by a task, the task
  2922. implementation is given a pointer to an interface describing a valid copy of
  2923. the data that is accessible from the current processing unit.
  2924. @end table
  2925. @node starpu_data_register
  2926. @subsection @code{starpu_data_register} -- Register a piece of data to StarPU
  2927. @deftypefun void starpu_data_register (starpu_data_handle *@var{handleptr}, uint32_t @var{home_node}, void *@var{interface}, {struct starpu_data_interface_ops_t} *@var{ops})
  2928. Register a piece of data into the handle located at the @var{handleptr}
  2929. address. The @var{interface} buffer contains the initial description of the
  2930. data in the home node. The @var{ops} argument is a pointer to a structure
  2931. describing the different methods used to manipulate this type of interface. See
  2932. @ref{struct starpu_data_interface_ops_t} for more details on this structure.
  2933. If @code{home_node} is -1, StarPU will automatically
  2934. allocate the memory when it is used for the
  2935. first time in write-only mode. Once such data handle has been automatically
  2936. allocated, it is possible to access it using any access mode.
  2937. Note that StarPU supplies a set of predefined types of interface (e.g. vector or
  2938. matrix) which can be registered by the means of helper functions (e.g.
  2939. @code{starpu_vector_data_register} or @code{starpu_matrix_data_register}).
  2940. @end deftypefun
  2941. @node starpu_data_unregister
  2942. @subsection @code{starpu_data_unregister} -- Unregister a piece of data from StarPU
  2943. @deftypefun void starpu_data_unregister (starpu_data_handle @var{handle})
  2944. This function unregisters a data handle from StarPU. If the data was
  2945. automatically allocated by StarPU because the home node was -1, all
  2946. automatically allocated buffers are freed. Otherwise, a valid copy of the data
  2947. is put back into the home node in the buffer that was initially registered.
  2948. Using a data handle that has been unregistered from StarPU results in an
  2949. undefined behaviour.
  2950. @end deftypefun
  2951. @node starpu_data_unregister_no_coherency
  2952. @subsection @code{starpu_data_unregister_no_coherency} -- Unregister a piece of data from StarPU
  2953. @deftypefun void starpu_data_unregister_no_coherency (starpu_data_handle @var{handle})
  2954. This is the same as starpu_data_unregister, except that StarPU does not put back
  2955. a valid copy into the home node, in the buffer that was initially registered.
  2956. @end deftypefun
  2957. @node starpu_data_invalidate
  2958. @subsection @code{starpu_data_invalidate} -- Invalidate all data replicates
  2959. @deftypefun void starpu_data_invalidate (starpu_data_handle @var{handle})
  2960. Destroy all replicates of the data handle. After data invalidation, the first
  2961. access to the handle must be performed in write-only mode. Accessing an
  2962. invalidated data in read-mode results in undefined behaviour.
  2963. @end deftypefun
  2964. @c TODO create a specific sections about user interaction with the DSM ?
  2965. @node starpu_data_acquire
  2966. @subsection @code{starpu_data_acquire} -- Access registered data from the application
  2967. @deftypefun int starpu_data_acquire (starpu_data_handle @var{handle}, starpu_access_mode @var{mode})
  2968. The application must call this function prior to accessing registered data from
  2969. main memory outside tasks. StarPU ensures that the application will get an
  2970. up-to-date copy of the data in main memory located where the data was
  2971. originally registered, and that all concurrent accesses (e.g. from tasks) will
  2972. be consistent with the access mode specified in the @var{mode} argument.
  2973. @code{starpu_data_release} must be called once the application does not need to
  2974. access the piece of data anymore. Note that implicit data
  2975. dependencies are also enforced by @code{starpu_data_acquire}, i.e.
@code{starpu_data_acquire} will wait for all tasks scheduled to work on
the data, unless they have been disabled explicitly by calling
  2978. @code{starpu_data_set_default_sequential_consistency_flag} or
  2979. @code{starpu_data_set_sequential_consistency_flag}.
  2980. @code{starpu_data_acquire} is a blocking call, so that it cannot be called from
  2981. tasks or from their callbacks (in that case, @code{starpu_data_acquire} returns
  2982. @code{-EDEADLK}). Upon successful completion, this function returns 0.
  2983. @end deftypefun
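For instance, to read a registered vector from the application once all
submitted tasks working on it have completed (a sketch, assuming @code{vector}
and @code{vector_handle} were registered as in @ref{Vector Interface}):
@cartouche
@smallexample
/* Blocks until an up-to-date copy is available in main memory. */
starpu_data_acquire(vector_handle, STARPU_R);
/* The application may now safely read the buffer. */
fprintf(stderr, "first element: %f\n", vector[0]);
starpu_data_release(vector_handle);
@end smallexample
@end cartouche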
  2984. @node starpu_data_acquire_cb
  2985. @subsection @code{starpu_data_acquire_cb} -- Access registered data from the application asynchronously
  2986. @deftypefun int starpu_data_acquire_cb (starpu_data_handle @var{handle}, starpu_access_mode @var{mode}, void (*@var{callback})(void *), void *@var{arg})
  2987. @code{starpu_data_acquire_cb} is the asynchronous equivalent of
@code{starpu_data_acquire}. When the data specified in the first argument is
  2989. available in the appropriate access mode, the callback function is executed.
  2990. The application may access the requested data during the execution of this
  2991. callback. The callback function must call @code{starpu_data_release} once the
  2992. application does not need to access the piece of data anymore.
  2993. Note that implicit data dependencies are also enforced by
  2994. @code{starpu_data_acquire_cb} in case they are enabled.
  2995. Contrary to @code{starpu_data_acquire}, this function is non-blocking and may
  2996. be called from task callbacks. Upon successful completion, this function
  2997. returns 0.
  2998. @end deftypefun
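A minimal sketch, assuming a globally visible handle @code{vector_handle}
registered on the @code{vector} array; the callback reads the data and then
releases it:
@cartouche
@smallexample
void dump_callback(void *arg)
@{
  float *v = arg;
  fprintf(stderr, "first element: %f\n", v[0]);
  starpu_data_release(vector_handle);
@}

/* Non-blocking: may even be called from a task callback. */
starpu_data_acquire_cb(vector_handle, STARPU_R, dump_callback, vector);
@end smallexample
@end cartouche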
  2999. @node STARPU_DATA_ACQUIRE_CB
  3000. @subsection @code{STARPU_DATA_ACQUIRE_CB} -- Access registered data from the application asynchronously, macro
  3001. @deftypefun STARPU_DATA_ACQUIRE_CB (starpu_data_handle @var{handle}, starpu_access_mode @var{mode}, code)
  3002. @code{STARPU_DATA_ACQUIRE_CB} is the same as @code{starpu_data_acquire_cb},
  3003. except that the code to be executed in a callback is directly provided as a
macro parameter, and the data handle is automatically released after it. This
makes it easy to execute code which depends on the value of some registered
data. This is non-blocking too and may be called from task callbacks.
  3007. @end deftypefun
  3008. @node starpu_data_release
  3009. @subsection @code{starpu_data_release} -- Release registered data from the application
  3010. @deftypefun void starpu_data_release (starpu_data_handle @var{handle})
  3011. This function releases the piece of data acquired by the application either by
  3012. @code{starpu_data_acquire} or by @code{starpu_data_acquire_cb}.
  3013. @end deftypefun
  3014. @node starpu_data_set_wt_mask
  3015. @subsection @code{starpu_data_set_wt_mask} -- Set the Write-Through mask
  3016. @deftypefun void starpu_data_set_wt_mask (starpu_data_handle @var{handle}, uint32_t @var{wt_mask})
  3017. This function sets the write-through mask of a given data, i.e. a bitmask of
  3018. nodes where the data should be always replicated after modification.
  3019. @end deftypefun
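For instance, to request that a valid copy always be kept in main memory
(memory node 0) after each modification of the data (a sketch):
@cartouche
@smallexample
/* Bit 0 set: write the data through to memory node 0 (RAM). */
starpu_data_set_wt_mask(vector_handle, 1<<0);
@end smallexample
@end cartouche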
  3020. @node starpu_data_prefetch_on_node
  3021. @subsection @code{starpu_data_prefetch_on_node} -- Prefetch data to a given node
  3022. @deftypefun int starpu_data_prefetch_on_node (starpu_data_handle @var{handle}, unsigned @var{node}, unsigned @var{async})
  3023. Issue a prefetch request for a given data to a given node, i.e.
  3024. requests that the data be replicated to the given node, so that it is available
  3025. there for tasks. If the @var{async} parameter is 0, the call will block until
  3026. the transfer is achieved, else the call will return as soon as the request is
  3027. scheduled (which may however have to wait for a task completion).
  3028. @end deftypefun
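For instance, to asynchronously replicate a piece of data on the memory node of
a given worker before tasks need it there (a sketch; @code{workerid} is assumed
to be known):
@cartouche
@smallexample
unsigned node = starpu_worker_get_memory_node(workerid);
/* async = 1: returns as soon as the request is scheduled. */
starpu_data_prefetch_on_node(vector_handle, node, 1);
@end smallexample
@end cartouche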
  3029. @node Data Interfaces
  3030. @section Data Interfaces
  3031. @menu
  3032. * Variable Interface::
  3033. * Vector Interface::
  3034. * Matrix Interface::
  3035. * 3D Matrix Interface::
  3036. * BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)::
  3037. * CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)::
  3038. @end menu
  3039. @node Variable Interface
  3040. @subsection Variable Interface
  3041. @table @asis
  3042. @item @emph{Description}:
  3043. This variant of @code{starpu_data_register} uses the variable interface,
  3044. i.e. for a mere single variable. @code{ptr} is the address of the variable,
  3045. and @code{elemsize} is the size of the variable.
  3046. @item @emph{Prototype}:
  3047. @code{void starpu_variable_data_register(starpu_data_handle *handle,
  3048. uint32_t home_node,
  3049. uintptr_t ptr, size_t elemsize);}
  3050. @item @emph{Example}:
  3051. @cartouche
  3052. @smallexample
  3053. float var;
  3054. starpu_data_handle var_handle;
  3055. starpu_variable_data_register(&var_handle, 0, (uintptr_t)&var, sizeof(var));
  3056. @end smallexample
  3057. @end cartouche
  3058. @end table
  3059. @node Vector Interface
  3060. @subsection Vector Interface
  3061. @table @asis
  3062. @item @emph{Description}:
  3063. This variant of @code{starpu_data_register} uses the vector interface,
  3064. i.e. for mere arrays of elements. @code{ptr} is the address of the first
  3065. element in the home node. @code{nx} is the number of elements in the vector.
  3066. @code{elemsize} is the size of each element.
  3067. @item @emph{Prototype}:
  3068. @code{void starpu_vector_data_register(starpu_data_handle *handle, uint32_t home_node,
  3069. uintptr_t ptr, uint32_t nx, size_t elemsize);}
  3070. @item @emph{Example}:
  3071. @cartouche
  3072. @smallexample
  3073. float vector[NX];
  3074. starpu_data_handle vector_handle;
  3075. starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
  3076. sizeof(vector[0]));
  3077. @end smallexample
  3078. @end cartouche
  3079. @end table
  3080. @node Matrix Interface
  3081. @subsection Matrix Interface
  3082. @table @asis
  3083. @item @emph{Description}:
  3084. This variant of @code{starpu_data_register} uses the matrix interface, i.e. for
  3085. matrices of elements. @code{ptr} is the address of the first element in the home
  3086. node. @code{ld} is the number of elements between rows. @code{nx} is the number
  3087. of elements in a row (this can be different from @code{ld} if there are extra
  3088. elements for alignment for instance). @code{ny} is the number of rows.
  3089. @code{elemsize} is the size of each element.
  3090. @item @emph{Prototype}:
  3091. @code{void starpu_matrix_data_register(starpu_data_handle *handle, uint32_t home_node,
  3092. uintptr_t ptr, uint32_t ld, uint32_t nx,
  3093. uint32_t ny, size_t elemsize);}
  3094. @item @emph{Example}:
  3095. @cartouche
  3096. @smallexample
  3097. float *matrix;
  3098. starpu_data_handle matrix_handle;
  3099. matrix = (float*)malloc(width * height * sizeof(float));
  3100. starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix,
  3101. width, width, height, sizeof(float));
  3102. @end smallexample
  3103. @end cartouche
  3104. @end table
  3105. @node 3D Matrix Interface
  3106. @subsection 3D Matrix Interface
  3107. @table @asis
  3108. @item @emph{Description}:
  3109. This variant of @code{starpu_data_register} uses the 3D matrix interface.
  3110. @code{ptr} is the address of the array of first element in the home node.
  3111. @code{ldy} is the number of elements between rows. @code{ldz} is the number
  3112. of rows between z planes. @code{nx} is the number of elements in a row (this
  3113. can be different from @code{ldy} if there are extra elements for alignment
  3114. for instance). @code{ny} is the number of rows in a z plane (likewise with
  3115. @code{ldz}). @code{nz} is the number of z planes. @code{elemsize} is the size of
  3116. each element.
  3117. @item @emph{Prototype}:
  3118. @code{void starpu_block_data_register(starpu_data_handle *handle, uint32_t home_node,
  3119. uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
  3120. uint32_t ny, uint32_t nz, size_t elemsize);}
  3121. @item @emph{Example}:
  3122. @cartouche
  3123. @smallexample
  3124. float *block;
  3125. starpu_data_handle block_handle;
  3126. block = (float*)malloc(nx*ny*nz*sizeof(float));
  3127. starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
  3128. nx, nx*ny, nx, ny, nz, sizeof(float));
  3129. @end smallexample
  3130. @end cartouche
  3131. @end table
  3132. @node BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
  3133. @subsection BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
  3134. @deftypefun void starpu_bcsr_data_register (starpu_data_handle *@var{handle}, uint32_t @var{home_node}, uint32_t @var{nnz}, uint32_t @var{nrow}, uintptr_t @var{nzval}, uint32_t *@var{colind}, uint32_t *@var{rowptr}, uint32_t @var{firstentry}, uint32_t @var{r}, uint32_t @var{c}, size_t @var{elemsize})
  3135. This variant of @code{starpu_data_register} uses the BCSR sparse matrix interface.
  3136. TODO
  3137. @end deftypefun
  3138. @node CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)
  3139. @subsection CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)
  3140. @deftypefun void starpu_csr_data_register (starpu_data_handle *@var{handle}, uint32_t @var{home_node}, uint32_t @var{nnz}, uint32_t @var{nrow}, uintptr_t @var{nzval}, uint32_t *@var{colind}, uint32_t *@var{rowptr}, uint32_t @var{firstentry}, size_t @var{elemsize})
  3141. This variant of @code{starpu_data_register} uses the CSR sparse matrix interface.
  3142. TODO
  3143. @end deftypefun
  3144. @node Data Partition
  3145. @section Data Partition
  3146. @menu
  3147. * struct starpu_data_filter:: StarPU filter structure
  3148. * starpu_data_partition:: Partition Data
  3149. * starpu_data_unpartition:: Unpartition Data
  3150. * starpu_data_get_nb_children::
  3151. * starpu_data_get_sub_data::
  3152. * Predefined filter functions::
  3153. @end menu
  3154. @node struct starpu_data_filter
  3155. @subsection @code{struct starpu_data_filter} -- StarPU filter structure
  3156. @table @asis
  3157. @item @emph{Description}:
  3158. The filter structure describes a data partitioning operation, to be given to the
@code{starpu_data_partition} function; see @ref{starpu_data_partition} for an example.
  3160. @item @emph{Fields}:
  3161. @table @asis
  3162. @item @code{filter_func}:
  3163. This function fills the @code{child_interface} structure with interface
  3164. information for the @code{id}-th child of the parent @code{father_interface} (among @code{nparts}).
  3165. @code{void (*filter_func)(void *father_interface, void* child_interface, struct starpu_data_filter *, unsigned id, unsigned nparts);}
  3166. @item @code{nchildren}:
  3167. This is the number of parts to partition the data into.
  3168. @item @code{get_nchildren}:
  3169. This returns the number of children. This can be used instead of @code{nchildren} when the number of
  3170. children depends on the actual data (e.g. the number of blocks in a sparse
  3171. matrix).
  3172. @code{unsigned (*get_nchildren)(struct starpu_data_filter *, starpu_data_handle initial_handle);}
  3173. @item @code{get_child_ops}:
  3174. In case the resulting children use a different data interface, this function
  3175. returns which interface is used by child number @code{id}.
  3176. @code{struct starpu_data_interface_ops_t *(*get_child_ops)(struct starpu_data_filter *, unsigned id);}
  3177. @item @code{filter_arg}:
Some filters take an additional parameter, but this is usually unused.
  3179. @item @code{filter_arg_ptr}:
  3180. Some filters take an additional array parameter like the sizes of the parts, but
  3181. this is usually unused.
  3182. @end table
  3183. @end table
  3184. @node starpu_data_partition
  3185. @subsection starpu_data_partition -- Partition Data
  3186. @table @asis
  3187. @item @emph{Description}:
  3188. This requests partitioning one StarPU data @code{initial_handle} into several
subdata according to the filter @code{f}.
  3190. @item @emph{Prototype}:
  3191. @code{void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data_filter *f);}
  3192. @item @emph{Example}:
  3193. @cartouche
  3194. @smallexample
  3195. struct starpu_data_filter f = @{
  3196. .filter_func = starpu_vertical_block_filter_func,
  3197. .nchildren = nslicesx,
  3198. .get_nchildren = NULL,
  3199. .get_child_ops = NULL
  3200. @};
  3201. starpu_data_partition(A_handle, &f);
  3202. @end smallexample
  3203. @end cartouche
  3204. @end table
  3205. @node starpu_data_unpartition
  3206. @subsection starpu_data_unpartition -- Unpartition data
  3207. @table @asis
  3208. @item @emph{Description}:
  3209. This unapplies one filter, thus unpartitioning the data. The pieces of data are
  3210. collected back into one big piece in the @code{gathering_node} (usually 0).
  3211. @item @emph{Prototype}:
  3212. @code{void starpu_data_unpartition(starpu_data_handle root_data, uint32_t gathering_node);}
  3213. @item @emph{Example}:
  3214. @cartouche
  3215. @smallexample
  3216. starpu_data_unpartition(A_handle, 0);
  3217. @end smallexample
  3218. @end cartouche
  3219. @end table
  3220. @node starpu_data_get_nb_children
  3221. @subsection starpu_data_get_nb_children
  3222. @table @asis
  3223. @item @emph{Description}:
  3224. This function returns the number of children.
  3225. @item @emph{Return value}:
  3226. The number of children.
  3227. @item @emph{Prototype}:
  3228. @code{int starpu_data_get_nb_children(starpu_data_handle handle);}
  3229. @end table
  3230. @c starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i);
  3231. @node starpu_data_get_sub_data
  3232. @subsection starpu_data_get_sub_data
  3233. @table @asis
  3234. @item @emph{Description}:
  3235. After partitioning a StarPU data by applying a filter,
  3236. @code{starpu_data_get_sub_data} can be used to get handles for each of the data
  3237. portions. @code{root_data} is the parent data that was partitioned. @code{depth}
  3238. is the number of filters to traverse (in case several filters have been applied,
  3239. to e.g. partition in row blocks, and then in column blocks), and the subsequent
  3240. parameters are the indexes.
  3241. @item @emph{Return value}:
  3242. A handle to the subdata.
  3243. @item @emph{Prototype}:
  3244. @code{starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_data, unsigned depth, ... );}
  3245. @item @emph{Example}:
  3246. @cartouche
  3247. @smallexample
  3248. h = starpu_data_get_sub_data(A_handle, 1, taskx);
  3249. @end smallexample
  3250. @end cartouche
  3251. @end table
  3252. @node Predefined filter functions
  3253. @subsection Predefined filter functions
  3254. @menu
  3255. * Partitioning BCSR Data::
  3256. * Partitioning BLAS interface::
  3257. * Partitioning Vector Data::
  3258. * Partitioning Block Data::
  3259. @end menu
  3260. This section gives a partial list of the predefined partitioning functions.
  3261. Examples on how to use them are shown in @ref{Partitioning Data}. The complete
list can be found in @code{starpu_data_filters.h}.
  3263. @node Partitioning BCSR Data
  3264. @subsubsection Partitioning BCSR Data
  3265. @deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
  3266. TODO
  3267. @end deftypefun
  3268. @deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
  3269. TODO
  3270. @end deftypefun
  3271. @node Partitioning BLAS interface
  3272. @subsubsection Partitioning BLAS interface
  3273. @deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
This partitions a dense matrix into horizontal blocks.
  3275. @end deftypefun
  3276. @deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
This partitions a dense matrix into vertical blocks.
  3278. @end deftypefun
  3279. @node Partitioning Vector Data
  3280. @subsubsection Partitioning Vector Data
  3281. @deftypefun void starpu_block_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
  3282. This partitions a vector into blocks of the same size.
  3283. @end deftypefun
  3284. @deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
  3285. This partitions a vector into blocks of sizes given in the @var{filter_arg_ptr}
field of @var{f}, which is assumed to point to an array of @code{uint32_t}.
  3287. @end deftypefun
  3288. @deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
  3289. This partitions a vector into two blocks, the first block size being given in
  3290. the @var{filter_arg} field of @var{f}.
  3291. @end deftypefun
  3292. @node Partitioning Block Data
  3293. @subsubsection Partitioning Block Data
  3294. @deftypefun void starpu_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
  3295. This partitions a 3D matrix along the X axis.
  3296. @end deftypefun
  3297. @node Codelets and Tasks
  3298. @section Codelets and Tasks
  3299. This section describes the interface to manipulate codelets and tasks.
  3300. @deftp {Data Type} {struct starpu_codelet}
  3301. The codelet structure describes a kernel that is possibly implemented on various
  3302. targets. For compatibility, make sure to initialize the whole structure to zero.
  3303. @table @asis
  3304. @item @code{where}
  3305. Indicates which types of processing units are able to execute the codelet.
  3306. @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
  3307. implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
  3308. indicates that it is only available on Cell SPUs.
  3309. @item @code{cpu_func} (optional)
  3310. Is a function pointer to the CPU implementation of the codelet. Its prototype
  3311. must be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The first
argument is the array of data managed by the data management library, and
the second argument is a pointer to the argument passed through the @code{cl_arg}
  3314. field of the @code{starpu_task} structure.
  3315. The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear in
the @code{where} field; it must be non-null otherwise.
  3317. @item @code{cuda_func} (optional)
  3318. Is a function pointer to the CUDA implementation of the codelet. @emph{This
  3319. must be a host-function written in the CUDA runtime API}. Its prototype must
  3320. be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
  3321. field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}
field; it must be non-null otherwise.
  3323. @item @code{opencl_func} (optional)
  3324. Is a function pointer to the OpenCL implementation of the codelet. Its
  3325. prototype must be:
  3326. @code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
  3327. This pointer is ignored if @code{STARPU_OPENCL} does not appear in the
@code{where} field; it must be non-null otherwise.
  3329. @item @code{gordon_func} (optional)
  3330. This is the index of the Cell SPU implementation within the Gordon library.
  3331. See Gordon documentation for more details on how to register a kernel and
  3332. retrieve its index.
  3333. @item @code{nbuffers}
  3334. Specifies the number of arguments taken by the codelet. These arguments are
  3335. managed by the DSM and are accessed from the @code{void *buffers[]}
  3336. array. The constant argument passed with the @code{cl_arg} field of the
  3337. @code{starpu_task} structure is not counted in this number. This value should
  3338. not be above @code{STARPU_NMAXBUFS}.
  3339. @item @code{model} (optional)
  3340. This is a pointer to the task duration performance model associated to this
  3341. codelet. This optional field is ignored when set to @code{NULL}.
  3342. TODO
  3343. @item @code{power_model} (optional)
  3344. This is a pointer to the task power consumption performance model associated
  3345. to this codelet. This optional field is ignored when set to @code{NULL}.
  3346. In the case of parallel codelets, this has to account for all processing units
  3347. involved in the parallel execution.
  3348. TODO
  3349. @end table
  3350. @end deftp
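As an illustration, a codelet with a CPU and a CUDA implementation working on a
single buffer could be defined as follows (a sketch; @code{scal_cpu_func} and
@code{scal_cuda_func} are hypothetical implementations with the prototypes
described above):
@cartouche
@smallexample
struct starpu_codelet cl =
@{
  .where = STARPU_CPU|STARPU_CUDA,
  .cpu_func = scal_cpu_func,
  .cuda_func = scal_cuda_func,
  .nbuffers = 1
@};
@end smallexample
@end cartouche
Using designated initializers as above leaves the remaining fields
zero-initialized, as required.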
  3351. @deftp {Data Type} {struct starpu_task}
  3352. The @code{starpu_task} structure describes a task that can be offloaded on the various
  3353. processing units managed by StarPU. It instantiates a codelet. It can either be
  3354. allocated dynamically with the @code{starpu_task_create} method, or declared
  3355. statically. In the latter case, the programmer has to zero the
  3356. @code{starpu_task} structure and to fill the different fields properly. The
  3357. indicated default values correspond to the configuration of a task allocated
  3358. with @code{starpu_task_create}.
  3359. @table @asis
  3360. @item @code{cl}
  3361. Is a pointer to the corresponding @code{starpu_codelet} data structure. This
  3362. describes where the kernel should be executed, and supplies the appropriate
implementations. When set to @code{NULL}, no code is executed during the task;
such empty tasks can be useful for synchronization purposes.
  3365. @item @code{buffers}
  3366. Is an array of @code{starpu_buffer_descr_t} structures. It describes the
  3367. different pieces of data accessed by the task, and how they should be accessed.
  3368. The @code{starpu_buffer_descr_t} structure is composed of two fields, the
  3369. @code{handle} field specifies the handle of the piece of data, and the
@code{mode} field is the required access mode (e.g. @code{STARPU_RW}). The number
  3371. of entries in this array must be specified in the @code{nbuffers} field of the
@code{starpu_codelet} structure, and should not exceed @code{STARPU_NMAXBUFS}.
If insufficient, this value can be set with the @code{--enable-maxbuffers}
  3374. option when configuring StarPU.
  3375. @item @code{cl_arg} (optional; default: @code{NULL})
  3376. This pointer is passed to the codelet through the second argument
  3377. of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
  3378. In the specific case of the Cell processor, see the @code{cl_arg_size}
  3379. argument.
  3380. @item @code{cl_arg_size} (optional, Cell-specific)
  3381. In the case of the Cell processor, the @code{cl_arg} pointer is not directly
  3382. given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
  3383. the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
  3384. at address @code{cl_arg}. In this case, the argument given to the SPU codelet
  3385. is therefore not the @code{cl_arg} pointer, but the address of the buffer in
  3386. local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
  3387. codelets, where the @code{cl_arg} pointer is given as such.
  3388. @item @code{callback_func} (optional) (default: @code{NULL})
  3389. This is a function pointer of prototype @code{void (*f)(void *)} which
  3390. specifies a possible callback. If this pointer is non-null, the callback
  3391. function is executed @emph{on the host} after the execution of the task. The
  3392. callback is passed the value contained in the @code{callback_arg} field. No
  3393. callback is executed if the field is set to @code{NULL}.
  3394. @item @code{callback_arg} (optional) (default: @code{NULL})
  3395. This is the pointer passed to the callback function. This field is ignored if
  3396. the @code{callback_func} is set to @code{NULL}.
  3397. @item @code{use_tag} (optional) (default: @code{0})
  3398. If set, this flag indicates that the task should be associated with the tag
contained in the @code{tag_id} field. Tags allow the application to synchronize
  3400. with the task and to express task dependencies easily.
  3401. @item @code{tag_id}
This field contains the tag associated with the task if the @code{use_tag} field
was set; it is ignored otherwise.
  3404. @item @code{synchronous}
  3405. If this flag is set, the @code{starpu_task_submit} function is blocking and
  3406. returns only when the task has been executed (or if no worker is able to
  3407. process the task). Otherwise, @code{starpu_task_submit} returns immediately.
  3408. @item @code{priority} (optional) (default: @code{STARPU_DEFAULT_PRIO})
  3409. This field indicates a level of priority for the task. This is an integer value
  3410. that must be set between the return values of the
  3411. @code{starpu_sched_get_min_priority} function for the least important tasks,
  3412. and that of the @code{starpu_sched_get_max_priority} for the most important
  3413. tasks (included). The @code{STARPU_MIN_PRIO} and @code{STARPU_MAX_PRIO} macros
are provided for convenience and respectively return the values of
  3415. @code{starpu_sched_get_min_priority} and @code{starpu_sched_get_max_priority}.
  3416. Default priority is @code{STARPU_DEFAULT_PRIO}, which is always defined as 0 in
  3417. order to allow static task initialization. Scheduling strategies that take
  3418. priorities into account can use this parameter to take better scheduling
  3419. decisions, but the scheduling policy may also ignore it.
  3420. @item @code{execute_on_a_specific_worker} (default: @code{0})
If this flag is set, StarPU will bypass the scheduler and directly assign this
task to the worker specified by the @code{workerid} field.
  3423. @item @code{workerid} (optional)
  3424. If the @code{execute_on_a_specific_worker} field is set, this field indicates
the identifier of the worker that should process this task (as
returned by @code{starpu_worker_get_id}). This field is ignored if the
@code{execute_on_a_specific_worker} field is set to 0.
  3428. @item @code{detach} (optional) (default: @code{1})
  3429. If this flag is set, it is not possible to synchronize with the task
  3430. by the means of @code{starpu_task_wait} later on. Internal data structures
  3431. are only guaranteed to be freed once @code{starpu_task_wait} is called if the
  3432. flag is not set.
  3433. @item @code{destroy} (optional) (default: @code{1})
  3434. If this flag is set, the task structure will automatically be freed, either
  3435. after the execution of the callback if the task is detached, or during
  3436. @code{starpu_task_wait} otherwise. If this flag is not set, dynamically
  3437. allocated data structures will not be freed until @code{starpu_task_destroy} is
  3438. called explicitly. Setting this flag for a statically allocated task structure
  3439. will result in undefined behaviour.
  3440. @item @code{predicted} (output field)
  3441. Predicted duration of the task. This field is only set if the scheduling
strategy uses performance models.
  3443. @end table
  3444. @end deftp
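For instance, a task built on the codelet above could be set up and submitted
as follows (a sketch, assuming @code{vector_handle} is a registered handle):
@cartouche
@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->buffers[0].handle = vector_handle;
task->buffers[0].mode = STARPU_RW;
task->synchronous = 1;  /* starpu_task_submit will block */
int ret = starpu_task_submit(task);
if (ret == -ENODEV)
  fprintf(stderr, "no worker can execute this task\n");
@end smallexample
@end cartouche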
  3445. @deftypefun void starpu_task_init ({struct starpu_task} *@var{task})
  3446. Initialize @var{task} with default values. This function is implicitly
  3447. called by @code{starpu_task_create}. By default, tasks initialized with
  3448. @code{starpu_task_init} must be deinitialized explicitly with
  3449. @code{starpu_task_deinit}. Tasks can also be initialized statically, using the
  3450. constant @code{STARPU_TASK_INITIALIZER}.
  3451. @end deftypefun
  3452. @deftypefun {struct starpu_task *} starpu_task_create (void)
  3453. Allocate a task structure and initialize it with default values. Tasks
  3454. allocated dynamically with @code{starpu_task_create} are automatically freed when the
task is terminated. If the destroy flag is explicitly unset, the resources used
by the task must be freed by an explicit call to
@code{starpu_task_destroy}.
  3458. @end deftypefun
  3459. @deftypefun void starpu_task_deinit ({struct starpu_task} *@var{task})
  3460. Release all the structures automatically allocated to execute @var{task}. This is
  3461. called automatically by @code{starpu_task_destroy}, but the task structure itself is not
  3462. freed. This should be used for statically allocated tasks for instance.
  3463. @end deftypefun
  3464. @deftypefun void starpu_task_destroy ({struct starpu_task} *@var{task})
  3465. Free the resource allocated during @code{starpu_task_create} and
  3466. associated with @var{task}. This function can be called automatically
  3467. after the execution of a task by setting the @code{destroy} flag of the
  3468. @code{starpu_task} structure (default behaviour). Calling this function
  3469. on a statically allocated task results in an undefined behaviour.
  3470. @end deftypefun
  3471. @deftypefun int starpu_task_wait ({struct starpu_task} *@var{task})
  3472. This function blocks until @var{task} has been executed. It is not possible to
  3473. synchronize with a task more than once. It is not possible to wait for
  3474. synchronous or detached tasks.
  3475. Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
  3476. indicates that the specified task was either synchronous or detached.
  3477. @end deftypefun
  3478. @deftypefun int starpu_task_submit ({struct starpu_task} *@var{task})
  3479. This function submits @var{task} to StarPU. Calling this function does
  3480. not mean that the task will be executed immediately as there can be data or task
  3481. (tag) dependencies that are not fulfilled yet: StarPU will take care of
  3482. scheduling this task with respect to such dependencies.
  3483. This function returns immediately if the @code{synchronous} field of the
@code{starpu_task} structure was set to 0, and blocks until the termination of
  3485. the task otherwise. It is also possible to synchronize the application with
  3486. asynchronous tasks by the means of tags, using the @code{starpu_tag_wait}
  3487. function for instance.
In case of success, this function returns 0; a return value of @code{-ENODEV}
  3489. means that there is no worker able to process this task (e.g. there is no GPU
  3490. available and this task is only implemented for CUDA devices).
  3491. @end deftypefun
  3492. @deftypefun int starpu_task_wait_for_all (void)
  3493. This function blocks until all the tasks that were submitted are terminated.
  3494. @end deftypefun
  3495. @deftypefun {struct starpu_task *} starpu_get_current_task (void)
  3496. This function returns the task currently executed by the worker, or
NULL if it is called either from a thread that is not a worker or simply
  3498. because there is no task being executed at the moment.
  3499. @end deftypefun
  3500. @deftypefun void starpu_display_codelet_stats ({struct starpu_codelet_t} *@var{cl})
  3501. Output on @code{stderr} some statistics on the codelet @var{cl}.
  3502. @end deftypefun
  3503. @c Callbacks : what can we put in callbacks ?
  3504. @node Explicit Dependencies
  3505. @section Explicit Dependencies
  3506. @menu
  3507. * starpu_task_declare_deps_array:: starpu_task_declare_deps_array
  3508. * starpu_tag_t:: Task logical identifier
  3509. * starpu_tag_declare_deps:: Declare the Dependencies of a Tag
  3510. * starpu_tag_declare_deps_array:: Declare the Dependencies of a Tag
  3511. * starpu_tag_wait:: Block until a Tag is terminated
  3512. * starpu_tag_wait_array:: Block until a set of Tags is terminated
  3513. * starpu_tag_remove:: Destroy a Tag
  3514. * starpu_tag_notify_from_apps:: Feed a tag explicitly
  3515. @end menu
  3516. @node starpu_task_declare_deps_array
  3517. @subsection @code{starpu_task_declare_deps_array} -- Declare task dependencies
  3518. @deftypefun void starpu_task_declare_deps_array ({struct starpu_task} *@var{task}, unsigned @var{ndeps}, {struct starpu_task} *@var{task_array}[])
  3519. Declare task dependencies between a @var{task} and an array of tasks of length
  3520. @var{ndeps}. This function must be called prior to the submission of the task,
but it may be called after the submission or the execution of the tasks in the
array, provided the tasks are still valid (i.e. they were not automatically
  3523. destroyed). Calling this function on a task that was already submitted or with
  3524. an entry of @var{task_array} that is not a valid task anymore results in an
  3525. undefined behaviour. If @var{ndeps} is null, no dependency is added. It is
  3526. possible to call @code{starpu_task_declare_deps_array} multiple times on the
same task; in this case, the dependencies are added. It is possible to have
  3528. redundancy in the task dependencies.
  3529. @end deftypefun
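For instance, to make @code{task_c} depend on both @code{task_a} and
@code{task_b} (a sketch; the dependencies are declared before @code{task_c} is
submitted):
@cartouche
@smallexample
struct starpu_task *deps[] = @{task_a, task_b@};
starpu_task_declare_deps_array(task_c, 2, deps);
starpu_task_submit(task_c);
@end smallexample
@end cartouche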
  3530. @node starpu_tag_t
  3531. @subsection @code{starpu_tag_t} -- Task logical identifier
  3532. @table @asis
  3533. @item @emph{Description}:
  3534. It is possible to associate a task with a unique ``tag'' chosen by the application, and to express
  3535. dependencies between tasks by the means of those tags. To do so, fill the
  3536. @code{tag_id} field of the @code{starpu_task} structure with a tag number (can
  3537. be arbitrary) and set the @code{use_tag} field to 1.
  3538. If @code{starpu_tag_declare_deps} is called with this tag number, the task will
not be started until the tasks which hold the declared dependency tags have
  3540. completed.
  3541. @end table
  3542. @node starpu_tag_declare_deps
  3543. @subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
  3544. @table @asis
  3545. @item @emph{Description}:
  3546. Specify the dependencies of the task identified by tag @code{id}. The first
  3547. argument specifies the tag which is configured, the second argument gives the
  3548. number of tag(s) on which @code{id} depends. The following arguments are the
  3549. tags which have to be terminated to unlock the task.
  3550. This function must be called before the associated task is submitted to StarPU
  3551. with @code{starpu_task_submit}.
  3552. @item @emph{Remark}
  3553. Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
  3554. last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
typically need to be explicitly cast. Using the
  3556. @code{starpu_tag_declare_deps_array} function avoids this hazard.
  3557. @item @emph{Prototype}:
  3558. @code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
  3559. @item @emph{Example}:
  3560. @cartouche
  3561. @example
  3562. /* Tag 0x1 depends on tags 0x32 and 0x52 */
  3563. starpu_tag_declare_deps((starpu_tag_t)0x1,
  3564. 2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
  3565. @end example
  3566. @end cartouche
  3567. @end table
  3568. @node starpu_tag_declare_deps_array
  3569. @subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
  3570. @table @asis
  3571. @item @emph{Description}:
This function is similar to @code{starpu_tag_declare_deps}, except that it
does not take a variable number of arguments but an array of tags of size
  3574. @code{ndeps}.
  3575. @item @emph{Prototype}:
  3576. @code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
  3577. @item @emph{Example}:
  3578. @cartouche
  3579. @example
  3580. /* Tag 0x1 depends on tags 0x32 and 0x52 */
  3581. starpu_tag_t tag_array[2] = @{0x32, 0x52@};
  3582. starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
  3583. @end example
  3584. @end cartouche
  3585. @end table
  3586. @node starpu_tag_wait
  3587. @subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
  3588. @deftypefun void starpu_tag_wait (starpu_tag_t @var{id})
  3589. This function blocks until the task associated to tag @var{id} has been
  3590. executed. This is a blocking call which must therefore not be called within
  3591. tasks or callbacks, but only from the application directly. It is possible to
  3592. synchronize with the same tag multiple times, as long as the
  3593. @code{starpu_tag_remove} function is not called. Note that it is still
possible to synchronize with a tag associated with a task whose @code{starpu_task}
  3595. data structure was freed (e.g. if the @code{destroy} flag of the
  3596. @code{starpu_task} was enabled).
  3597. @end deftypefun
  3598. @node starpu_tag_wait_array
  3599. @subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
  3600. @deftypefun void starpu_tag_wait_array (unsigned @var{ntags}, starpu_tag_t *@var{id})
  3601. This function is similar to @code{starpu_tag_wait} except that it blocks until
  3602. @emph{all} the @var{ntags} tags contained in the @var{id} array are
  3603. terminated.
  3604. @end deftypefun
  3605. @node starpu_tag_remove
  3606. @subsection @code{starpu_tag_remove} -- Destroy a Tag
  3607. @deftypefun void starpu_tag_remove (starpu_tag_t @var{id})
  3608. This function releases the resources associated to tag @var{id}. It can be
  3609. called once the corresponding task has been executed and when there is
no other tag that depends on this tag anymore.
  3611. @end deftypefun
  3612. @node starpu_tag_notify_from_apps
  3613. @subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly
  3614. @deftypefun void starpu_tag_notify_from_apps (starpu_tag_t @var{id})
  3615. This function explicitly unlocks tag @var{id}. It may be useful in the
  3616. case of applications which execute part of their computation outside StarPU
  3617. tasks (e.g. third-party libraries). It is also provided as a
  3618. convenient tool for the programmer, for instance to entirely construct the task
  3619. DAG before actually giving StarPU the opportunity to execute the tasks.
  3620. @end deftypefun
  3621. @node Implicit Data Dependencies
  3622. @section Implicit Data Dependencies
  3623. @menu
  3624. * starpu_data_set_default_sequential_consistency_flag:: starpu_data_set_default_sequential_consistency_flag
  3625. * starpu_data_get_default_sequential_consistency_flag:: starpu_data_get_default_sequential_consistency_flag
  3626. * starpu_data_set_sequential_consistency_flag:: starpu_data_set_sequential_consistency_flag
  3627. @end menu
  3628. In this section, we describe how StarPU makes it possible to insert implicit
  3629. task dependencies in order to enforce sequential data consistency. When this
  3630. data consistency is enabled on a specific data handle, any data access will
  3631. appear as sequentially consistent from the application. For instance, if the
  3632. application submits two tasks that access the same piece of data in read-only
mode, and then a third task that accesses it in write mode, dependencies will be
  3634. added between the two first tasks and the third one. Implicit data dependencies
  3635. are also inserted in the case of data accesses from the application.
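For instance, with sequential data consistency enabled on @code{handle} (the
default), the submission order below is enough for StarPU to insert the
dependencies automatically (a sketch; the tasks are assumed to access
@code{handle} with the indicated modes):
@cartouche
@smallexample
starpu_task_submit(reader1);  /* accesses handle in STARPU_R mode */
starpu_task_submit(reader2);  /* accesses handle in STARPU_R mode */
/* The writer automatically depends on reader1 and reader2. */
starpu_task_submit(writer);   /* accesses handle in STARPU_W mode */
@end smallexample
@end cartouche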
  3636. @node starpu_data_set_default_sequential_consistency_flag
  3637. @subsection @code{starpu_data_set_default_sequential_consistency_flag} -- Set default sequential consistency flag
  3638. @deftypefun void starpu_data_set_default_sequential_consistency_flag (unsigned @var{flag})
  3639. Set the default sequential consistency flag. If a non-zero value is passed, a
  3640. sequential data consistency will be enforced for all handles registered after
  3641. this function call, otherwise it is disabled. By default, StarPU enables
  3642. sequential data consistency. It is also possible to select the data consistency
  3643. mode of a specific data handle with the
  3644. @code{starpu_data_set_sequential_consistency_flag} function.
  3645. @end deftypefun
  3646. @node starpu_data_get_default_sequential_consistency_flag
  3647. @subsection @code{starpu_data_get_default_sequential_consistency_flag} -- Get current default sequential consistency flag
@deftypefun unsigned starpu_data_get_default_sequential_consistency_flag (void)
  3649. This function returns the current default sequential consistency flag.
  3650. @end deftypefun
  3651. @node starpu_data_set_sequential_consistency_flag
  3652. @subsection @code{starpu_data_set_sequential_consistency_flag} -- Set data sequential consistency mode
  3653. @deftypefun void starpu_data_set_sequential_consistency_flag (starpu_data_handle @var{handle}, unsigned @var{flag})
  3654. Select the data consistency mode associated to a data handle. The consistency
mode set using this function has priority over the default mode, which can
be set with @code{starpu_data_set_default_sequential_consistency_flag}.
  3657. @end deftypefun
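For instance, implicit dependencies can be disabled for a temporary handle
while keeping the default behaviour for all other data (a sketch):
@cartouche
@smallexample
/* No implicit task dependencies will be introduced for tmp_handle. */
starpu_data_set_sequential_consistency_flag(tmp_handle, 0);
@end smallexample
@end cartouche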
  3658. @node Performance Model API
  3659. @section Performance Model API
  3660. @menu
  3661. * starpu_load_history_debug::
  3662. * starpu_perfmodel_debugfilepath::
  3663. * starpu_perfmodel_get_arch_name::
  3664. * starpu_force_bus_sampling::
  3665. @end menu
  3666. @node starpu_load_history_debug
  3667. @subsection @code{starpu_load_history_debug}
  3668. @deftypefun int starpu_load_history_debug ({const char} *@var{symbol}, {struct starpu_perfmodel_t} *@var{model})
  3669. TODO
  3670. @end deftypefun
  3671. @node starpu_perfmodel_debugfilepath
  3672. @subsection @code{starpu_perfmodel_debugfilepath}
  3673. @deftypefun void starpu_perfmodel_debugfilepath ({struct starpu_perfmodel_t} *@var{model}, {enum starpu_perf_archtype} @var{arch}, char *@var{path}, size_t @var{maxlen})
  3674. TODO
  3675. @end deftypefun
  3676. @node starpu_perfmodel_get_arch_name
  3677. @subsection @code{starpu_perfmodel_get_arch_name}
  3678. @deftypefun void starpu_perfmodel_get_arch_name ({enum starpu_perf_archtype} @var{arch}, char *@var{archname}, size_t @var{maxlen})
  3679. TODO
  3680. @end deftypefun
  3681. @node starpu_force_bus_sampling
  3682. @subsection @code{starpu_force_bus_sampling}
  3683. @deftypefun void starpu_force_bus_sampling (void)
  3684. This forces sampling the bus performance model again.
  3685. @end deftypefun
  3686. @node Profiling API
  3687. @section Profiling API
  3688. @menu
  3689. * starpu_profiling_status_set:: starpu_profiling_status_set
  3690. * starpu_profiling_status_get:: starpu_profiling_status_get
  3691. * struct starpu_task_profiling_info:: task profiling information
  3692. * struct starpu_worker_profiling_info:: worker profiling information
  3693. * starpu_worker_get_profiling_info:: starpu_worker_get_profiling_info
  3694. * struct starpu_bus_profiling_info:: bus profiling information
  3695. * starpu_bus_get_count::
  3696. * starpu_bus_get_id::
  3697. * starpu_bus_get_src::
  3698. * starpu_bus_get_dst::
  3699. * starpu_timing_timespec_delay_us::
  3700. * starpu_timing_timespec_to_us::
  3701. * starpu_bus_profiling_helper_display_summary::
  3702. * starpu_worker_profiling_helper_display_summary::
  3703. @end menu
  3704. @node starpu_profiling_status_set
  3705. @subsection @code{starpu_profiling_status_set} -- Set current profiling status
  3706. @table @asis
  3707. @item @emph{Description}:
This function sets the profiling status. Profiling is activated by passing
  3709. @code{STARPU_PROFILING_ENABLE} in @code{status}. Passing
  3710. @code{STARPU_PROFILING_DISABLE} disables profiling. Calling this function
  3711. resets all profiling measurements. When profiling is enabled, the
  3712. @code{profiling_info} field of the @code{struct starpu_task} structure points
  3713. to a valid @code{struct starpu_task_profiling_info} structure containing
  3714. information about the execution of the task.
  3715. @item @emph{Return value}:
  3716. Negative return values indicate an error, otherwise the previous status is
  3717. returned.
  3718. @item @emph{Prototype}:
  3719. @code{int starpu_profiling_status_set(int status);}
  3720. @end table
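For instance, profiling can be enabled before submitting tasks, and the
per-task measurements read back once the task has terminated (a sketch,
assuming the date fields are @code{struct timespec} values and using
@code{starpu_timing_timespec_delay_us}, described below):
@cartouche
@smallexample
starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
/* ... submit the task and wait for its termination ... */
struct starpu_task_profiling_info *info = task->profiling_info;
double duration_us =
    starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
@end smallexample
@end cartouche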
  3721. @node starpu_profiling_status_get
  3722. @subsection @code{starpu_profiling_status_get} -- Get current profiling status
  3723. @deftypefun int starpu_profiling_status_get (void)
  3724. Return the current profiling status or a negative value in case there was an error.
  3725. @end deftypefun
  3726. @node struct starpu_task_profiling_info
  3727. @subsection @code{struct starpu_task_profiling_info} -- Task profiling information
  3728. @table @asis
  3729. @item @emph{Description}:
  3730. This structure contains information about the execution of a task. It is
  3731. accessible from the @code{.profiling_info} field of the @code{starpu_task}
  3732. structure if profiling was enabled.
  3733. @item @emph{Fields}:
  3734. @table @asis
  3735. @item @code{submit_time}:
  3736. Date of task submission (relative to the initialization of StarPU).
  3737. @item @code{start_time}:
  3738. Date of task execution beginning (relative to the initialization of StarPU).
  3739. @item @code{end_time}:
  3740. Date of task execution termination (relative to the initialization of StarPU).
  3741. @item @code{workerid}:
  3742. Identifier of the worker which has executed the task.
  3743. @end table
  3744. @end table
  3745. @node struct starpu_worker_profiling_info
  3746. @subsection @code{struct starpu_worker_profiling_info} -- Worker profiling information
  3747. @table @asis
  3748. @item @emph{Description}:
  3749. This structure contains the profiling information associated to a worker.
  3750. @item @emph{Fields}:
  3751. @table @asis
  3752. @item @code{start_time}:
  3753. Starting date for the reported profiling measurements.
  3754. @item @code{total_time}:
  3755. Duration of the profiling measurement interval.
  3756. @item @code{executing_time}:
  3757. Time spent by the worker to execute tasks during the profiling measurement interval.
  3758. @item @code{sleeping_time}:
  3759. Time spent idling by the worker during the profiling measurement interval.
  3760. @item @code{executed_tasks}:
  3761. Number of tasks executed by the worker during the profiling measurement interval.
  3762. @end table
  3763. @end table
  3764. @node starpu_worker_get_profiling_info
  3765. @subsection @code{starpu_worker_get_profiling_info} -- Get worker profiling info
  3766. @table @asis
  3767. @item @emph{Description}:
  3768. Get the profiling info associated to the worker identified by @code{workerid},
  3769. and reset the profiling measurements. If the @code{worker_info} argument is
  3770. NULL, only reset the counters associated to worker @code{workerid}.
  3771. @item @emph{Return value}:
  3772. Upon successful completion, this function returns 0. Otherwise, a negative
  3773. value is returned.
  3774. @item @emph{Prototype}:
  3775. @code{int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *worker_info);}
  3776. @end table
  3777. @node struct starpu_bus_profiling_info
  3778. @subsection @code{struct starpu_bus_profiling_info} -- Bus profiling information
  3779. @table @asis
  3780. @item @emph{Description}:
  3781. TODO
  3782. @item @emph{Fields}:
  3783. @table @asis
  3784. @item @code{start_time}:
  3785. TODO
  3786. @item @code{total_time}:
  3787. TODO
  3788. @item @code{transferred_bytes}:
  3789. TODO
  3790. @item @code{transfer_count}:
  3791. TODO
  3792. @end table
  3793. @end table
  3794. @node starpu_bus_get_count
  3795. @subsection @code{starpu_bus_get_count}
  3796. @deftypefun int starpu_bus_get_count (void)
  3797. TODO
  3798. @end deftypefun
  3799. @node starpu_bus_get_id
  3800. @subsection @code{starpu_bus_get_id}
  3801. @deftypefun int starpu_bus_get_id (int @var{src}, int @var{dst})
  3802. TODO
  3803. @end deftypefun
  3804. @node starpu_bus_get_src
  3805. @subsection @code{starpu_bus_get_src}
  3806. @deftypefun int starpu_bus_get_src (int @var{busid})
  3807. TODO
  3808. @end deftypefun
  3809. @node starpu_bus_get_dst
  3810. @subsection @code{starpu_bus_get_dst}
  3811. @deftypefun int starpu_bus_get_dst (int @var{busid})
  3812. TODO
  3813. @end deftypefun
  3814. @node starpu_timing_timespec_delay_us
  3815. @subsection @code{starpu_timing_timespec_delay_us}
  3816. @deftypefun double starpu_timing_timespec_delay_us ({struct timespec} *@var{start}, {struct timespec} *@var{end})
  3817. TODO
  3818. @end deftypefun
  3819. @node starpu_timing_timespec_to_us
  3820. @subsection @code{starpu_timing_timespec_to_us}
  3821. @deftypefun double starpu_timing_timespec_to_us ({struct timespec} *@var{ts})
  3822. TODO
  3823. @end deftypefun
  3824. @node starpu_bus_profiling_helper_display_summary
  3825. @subsection @code{starpu_bus_profiling_helper_display_summary}
  3826. @deftypefun void starpu_bus_profiling_helper_display_summary (void)
  3827. TODO
  3828. @end deftypefun
  3829. @node starpu_worker_profiling_helper_display_summary
  3830. @subsection @code{starpu_worker_profiling_helper_display_summary}
  3831. @deftypefun void starpu_worker_profiling_helper_display_summary (void)
  3832. TODO
  3833. @end deftypefun
  3834. @node CUDA extensions
  3835. @section CUDA extensions
  3836. @c void starpu_malloc(float **A, size_t dim);
  3837. @menu
  3838. * starpu_cuda_get_local_stream:: Get current worker's CUDA stream
  3839. * starpu_helper_cublas_init:: Initialize CUBLAS on every CUDA device
  3840. * starpu_helper_cublas_shutdown:: Deinitialize CUBLAS on every CUDA device
  3841. @end menu
  3842. @node starpu_cuda_get_local_stream
  3843. @subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream
  3844. @deftypefun {cudaStream_t *} starpu_cuda_get_local_stream (void)
  3845. StarPU provides a stream for every CUDA device controlled by StarPU. This
  3846. function is only provided for convenience so that programmers can easily use
  3847. asynchronous operations within codelets without having to create a stream by
  3848. hand. Note that the application is not forced to use the stream provided by
  3849. @code{starpu_cuda_get_local_stream} and may also create its own streams.
  3850. Synchronizing with @code{cudaThreadSynchronize()} is allowed, but will reduce
  3851. the likelihood of having all transfers overlapped.
  3852. @end deftypefun
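For instance, a CUDA codelet implementation may queue its kernel on the
per-worker stream and then wait for it (a sketch; @code{my_kernel},
@code{nblocks}, @code{nthreads}, @code{d_ptr} and @code{n} are hypothetical):
@cartouche
@smallexample
cudaStream_t stream = *starpu_cuda_get_local_stream();
my_kernel<<<nblocks, nthreads, 0, stream>>>(d_ptr, n);
cudaStreamSynchronize(stream);
@end smallexample
@end cartouche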
  3853. @node starpu_helper_cublas_init
  3854. @subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device
  3855. @deftypefun void starpu_helper_cublas_init (void)
  3856. The CUBLAS library must be initialized prior to any CUBLAS call. Calling
  3857. @code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
  3858. controlled by StarPU. This call blocks until CUBLAS has been properly
  3859. initialized on every device.
  3860. @end deftypefun
  3861. @node starpu_helper_cublas_shutdown
  3862. @subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device
  3863. @deftypefun void starpu_helper_cublas_shutdown (void)
  3864. This function synchronously deinitializes the CUBLAS library on every CUDA device.
  3865. @end deftypefun
@node OpenCL extensions
@section OpenCL extensions
@menu
* Compiling OpenCL kernels:: Compiling OpenCL kernels
* Loading OpenCL kernels:: Loading OpenCL kernels
* OpenCL statistics:: Collecting statistics from OpenCL
@end menu
@node Compiling OpenCL kernels
@subsection Compiling OpenCL kernels
Source code for OpenCL kernels can be stored in a file or in a
string. StarPU provides functions to build the program executable for
each available OpenCL device as a @code{cl_program} object. This
program executable can then be loaded within a specific queue as
explained in the next section. These are only helpers; applications
can also fill a @code{starpu_opencl_program} array by hand for more advanced
uses (e.g. different programs on the different OpenCL devices, for
relocation purposes).
@menu
* starpu_opencl_load_opencl_from_file:: Compiling OpenCL source code
* starpu_opencl_load_opencl_from_string:: Compiling OpenCL source code
* starpu_opencl_unload_opencl:: Releasing OpenCL code
@end menu
@node starpu_opencl_load_opencl_from_file
@subsubsection @code{starpu_opencl_load_opencl_from_file} -- Compiling OpenCL source code
@deftypefun int starpu_opencl_load_opencl_from_file (char *@var{source_file_name}, {struct starpu_opencl_program} *@var{opencl_programs}, {const char}* @var{build_options})
Compiles the OpenCL source code stored in the file @var{source_file_name}
with the given @var{build_options}, for every OpenCL device, and stores the
resulting programs in @var{opencl_programs}.
@end deftypefun
@node starpu_opencl_load_opencl_from_string
@subsubsection @code{starpu_opencl_load_opencl_from_string} -- Compiling OpenCL source code
@deftypefun int starpu_opencl_load_opencl_from_string (char *@var{opencl_program_source}, {struct starpu_opencl_program} *@var{opencl_programs}, {const char}* @var{build_options})
Compiles the OpenCL source code contained in the string
@var{opencl_program_source} with the given @var{build_options}, for every
OpenCL device, and stores the resulting programs in @var{opencl_programs}.
@end deftypefun
@node starpu_opencl_unload_opencl
@subsubsection @code{starpu_opencl_unload_opencl} -- Releasing OpenCL code
@deftypefun int starpu_opencl_unload_opencl ({struct starpu_opencl_program} *@var{opencl_programs})
Releases the programs previously compiled into @var{opencl_programs}.
@end deftypefun
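For example, an application would typically compile its kernels once at
startup and release them once all tasks have completed (a minimal sketch; the
file name is an arbitrary example):
@cartouche
@smallexample
struct starpu_opencl_program programs;

/* Build the kernels in this file for every OpenCL device. */
starpu_opencl_load_opencl_from_file("my_kernels.cl", &programs, NULL);

/* ... submit tasks whose OpenCL implementations use these programs ... */

starpu_task_wait_for_all();
starpu_opencl_unload_opencl(&programs);
@end smallexample
@end cartouche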
@node Loading OpenCL kernels
@subsection Loading OpenCL kernels
@menu
* starpu_opencl_load_kernel:: Loading a kernel
* starpu_opencl_release_kernel:: Releasing a kernel
@end menu
@node starpu_opencl_load_kernel
@subsubsection @code{starpu_opencl_load_kernel} -- Loading a kernel
@deftypefun int starpu_opencl_load_kernel (cl_kernel *@var{kernel}, cl_command_queue *@var{queue}, {struct starpu_opencl_program} *@var{opencl_programs}, char *@var{kernel_name}, int @var{devid})
Creates the kernel named @var{kernel_name} for the OpenCL device @var{devid}
from the programs stored in @var{opencl_programs}; the kernel is returned in
@var{kernel}, together with the command queue of that device in @var{queue}.
@end deftypefun
@node starpu_opencl_release_kernel
@subsubsection @code{starpu_opencl_release_kernel} -- Releasing a kernel
@deftypefun int starpu_opencl_release_kernel (cl_kernel @var{kernel})
Releases the given @var{kernel}, to be called after kernel execution.
@end deftypefun
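The sketch below shows how an OpenCL codelet could retrieve a kernel and the
command queue of the current device; @code{programs} is assumed to have been
filled beforehand with @code{starpu_opencl_load_opencl_from_file}, and the
kernel name is an example:
@cartouche
@smallexample
void scal_opencl_func(void *buffers[], void *cl_arg)
@{
    cl_kernel kernel;
    cl_command_queue queue;
    int devid = starpu_worker_get_devid(starpu_worker_get_id());

    starpu_opencl_load_kernel(&kernel, &queue, &programs,
                              "vector_mult_opencl", devid);

    /* ... clSetKernelArg() and clEnqueueNDRangeKernel() on 'queue' ... */

    clFinish(queue);
    starpu_opencl_release_kernel(kernel);
@}
@end smallexample
@end cartouche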
@node OpenCL statistics
@subsection OpenCL statistics
@menu
* starpu_opencl_collect_stats:: Collect statistics on a kernel execution
@end menu
@node starpu_opencl_collect_stats
@subsubsection @code{starpu_opencl_collect_stats} -- Collect statistics on a kernel execution
@deftypefun int starpu_opencl_collect_stats (cl_event @var{event})
After termination of the kernels, the OpenCL codelet should call this function
and pass it the event returned by @code{clEnqueueNDRangeKernel}, to let StarPU
collect statistics about the kernel execution (used cycles, consumed power).
@end deftypefun
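Within an OpenCL codelet, the call is typically placed after waiting for the
kernel to complete (a minimal sketch; @code{queue}, @code{kernel},
@code{global} and @code{local} are assumed to have been set up as usual):
@cartouche
@smallexample
cl_event event;

clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local,
                       0, NULL, &event);
clFinish(queue);

/* Let StarPU record the statistics attached to the event. */
starpu_opencl_collect_stats(event);
clReleaseEvent(event);
@end smallexample
@end cartouche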
@node Cell extensions
@section Cell extensions
Nothing yet.
@node Miscellaneous helpers
@section Miscellaneous helpers
@menu
* starpu_data_cpy:: Copy a data handle into another data handle
* starpu_execute_on_each_worker:: Execute a function on a subset of workers
@end menu
@node starpu_data_cpy
@subsection @code{starpu_data_cpy} -- Copy a data handle into another data handle
@deftypefun int starpu_data_cpy (starpu_data_handle @var{dst_handle}, starpu_data_handle @var{src_handle}, int @var{asynchronous}, void (*@var{callback_func})(void*), void *@var{callback_arg})
Copy the content of @var{src_handle} into @var{dst_handle}.
The @var{asynchronous} parameter indicates whether the function should
block or not. In the case of an asynchronous call, it is possible to
synchronize with the termination of this operation either by means of
implicit dependencies (if enabled) or by calling
@code{starpu_task_wait_for_all()}. If @var{callback_func} is not @code{NULL},
this callback function is executed after the handle has been copied, and it is
given the @var{callback_arg} pointer as argument.
@end deftypefun
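For instance, assuming @code{src_handle} and @code{dst_handle} are two
previously registered handles of the same size, the copy can be performed
either way (the callback is a hypothetical helper):
@cartouche
@smallexample
void copy_done(void *arg)
@{
    fprintf(stderr, "copy %s finished\n", (char *)arg);
@}

/* ... later, in the application code ... */

/* Asynchronous copy: returns immediately, copy_done runs on completion. */
starpu_data_cpy(dst_handle, src_handle, 1, copy_done, "of vector A");

/* Synchronous copy: blocks until dst_handle holds the content of src_handle. */
starpu_data_cpy(dst_handle, src_handle, 0, NULL, NULL);
@end smallexample
@end cartouche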
@node starpu_execute_on_each_worker
@subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
@deftypefun void starpu_execute_on_each_worker (void (*@var{func})(void *), void *@var{arg}, uint32_t @var{where})
When calling this function, the offloaded function specified by the first
argument is executed by every StarPU worker that is able to execute it.
The second argument is passed to the offloaded function.
The last argument specifies on which types of processing units the function
should be executed. Similarly to the @var{where} field of the
@code{starpu_codelet} structure, it is possible to specify that the function
should be executed on every CUDA device and every CPU by passing
@code{STARPU_CPU|STARPU_CUDA}.
This function blocks until the function has been executed on every appropriate
processing unit, so it must not be called from a callback function, for
instance.
@end deftypefun
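For instance, the following sketch makes every CPU and CUDA worker print its
identifier (the function name is arbitrary):
@cartouche
@smallexample
void say_hello(void *arg)
@{
    fprintf(stderr, "worker %d says %s\n",
            starpu_worker_get_id(), (char *)arg);
@}

/* Blocks until every CPU and CUDA worker has executed say_hello. */
starpu_execute_on_each_worker(say_hello, "hello", STARPU_CPU|STARPU_CUDA);
@end smallexample
@end cartouche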
@c ---------------------------------------------------------------------
@c Advanced Topics
@c ---------------------------------------------------------------------
@node Advanced Topics
@chapter Advanced Topics
@menu
* Defining a new data interface::
* Defining a new scheduling policy::
@end menu
@node Defining a new data interface
@section Defining a new data interface
@menu
* struct starpu_data_interface_ops_t:: Per-interface methods
* struct starpu_data_copy_methods:: Per-interface data transfer methods
* An example of data interface:: An example of data interface
@end menu
@c void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memory_node); TODO
@node struct starpu_data_interface_ops_t
@subsection @code{struct starpu_data_interface_ops_t} -- Per-interface methods
@table @asis
@item @emph{Description}:
TODO describe all the different fields
@end table
@node struct starpu_data_copy_methods
@subsection @code{struct starpu_data_copy_methods} -- Per-interface data transfer methods
@table @asis
@item @emph{Description}:
TODO describe all the different fields
@end table
@node An example of data interface
@subsection An example of data interface
@table @asis
TODO
See @code{src/datawizard/interfaces/vector_interface.c} for now.
@end table
@node Defining a new scheduling policy
@section Defining a new scheduling policy
TODO
A full example showing how to define a new scheduling policy is available in
the StarPU sources in the directory @code{examples/scheduler/}.
@menu
* struct starpu_sched_policy_s::
* starpu_worker_set_sched_condition::
* starpu_sched_set_min_priority:: Set the minimum priority level
* starpu_sched_set_max_priority:: Set the maximum priority level
* starpu_push_local_task:: Assign a task to a worker
* Source code::
@end menu
@node struct starpu_sched_policy_s
@subsection @code{struct starpu_sched_policy_s} -- Scheduler methods
@table @asis
@item @emph{Description}:
This structure contains all the methods that implement a scheduling policy. An
application may specify which scheduling strategy to use in the @code{sched_policy}
field of the @code{starpu_conf} structure passed to the @code{starpu_init}
function.
@item @emph{Fields}:
@table @asis
@item @code{init_sched}:
Initialize the scheduling policy.
@item @code{deinit_sched}:
Cleanup the scheduling policy.
@item @code{push_task}:
Insert a task into the scheduler.
@item @code{push_prio_task}:
Insert a priority task into the scheduler.
@item @code{push_prio_notify}:
Notify the scheduler that a task was pushed on the worker. This method is
called when a task that was explicitly assigned to a worker is scheduled. This
method therefore makes it possible to keep the state of the scheduler coherent
even when StarPU bypasses the scheduling strategy.
@item @code{pop_task}:
Get a task from the scheduler. The mutex associated to the worker is already
taken when this method is called. If this method is defined as @code{NULL}, the
worker will only execute tasks from its local queue. In this case, the
@code{push_task} method should use the @code{starpu_push_local_task} method to
assign tasks to the different workers.
@item @code{pop_every_task}:
Remove all available tasks from the scheduler (tasks are chained by means of
the @code{prev} and @code{next} fields of the @code{starpu_task} structure).
The mutex associated to the worker is already taken when this method is called.
@item @code{post_exec_hook} (optional):
This method is called every time a task has been executed.
@item @code{policy_name}:
Name of the policy (optional).
@item @code{policy_description}:
Description of the policy (optional).
@end table
@end table
@node starpu_worker_set_sched_condition
@subsection @code{starpu_worker_set_sched_condition} -- Specify the condition variable associated to a worker
@deftypefun void starpu_worker_set_sched_condition (int @var{workerid}, pthread_cond_t *@var{sched_cond}, pthread_mutex_t *@var{sched_mutex})
When there is no available task for a worker, StarPU blocks this worker on a
condition variable. This function specifies which condition variable (and the
associated mutex) should be used to block (and to wake up) a worker. Note that
multiple workers may use the same condition variable. For instance, in the case
of a scheduling strategy with a single task queue, the same condition variable
would be used to block and wake up all workers.
The initialization method of a scheduling strategy (@code{init_sched}) must
call this function once per worker.
@end deftypefun
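For instance, a strategy built around a single shared task queue could
register the same condition variable and mutex for every worker from its
initialization method (a minimal sketch; the function name is arbitrary):
@cartouche
@smallexample
static pthread_cond_t sched_cond;
static pthread_mutex_t sched_mutex;

/* To be called from the init_sched method of the policy. */
static void init_shared_condition(void)
@{
    pthread_cond_init(&sched_cond, NULL);
    pthread_mutex_init(&sched_mutex, NULL);

    unsigned workerid;
    for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
        starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
@}
@end smallexample
@end cartouche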
@node starpu_sched_set_min_priority
@subsection @code{starpu_sched_set_min_priority}
@deftypefun void starpu_sched_set_min_priority (int @var{min_prio})
Defines the minimum priority level supported by the scheduling policy. The
default minimum priority level is the same as the default priority level which
is 0 by convention. The application may access that value by calling the
@code{starpu_sched_get_min_priority} function. This function should only be
called from the initialization method of the scheduling policy, and should not
be used directly from the application.
@end deftypefun
@node starpu_sched_set_max_priority
@subsection @code{starpu_sched_set_max_priority}
@deftypefun void starpu_sched_set_max_priority (int @var{max_prio})
Defines the maximum priority level supported by the scheduling policy. The
default maximum priority level is 1. The application may access that value by
calling the @code{starpu_sched_get_max_priority} function. This function should
only be called from the initialization method of the scheduling policy, and
should not be used directly from the application.
@end deftypefun
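Both functions are typically called together from the initialization method to
declare the priority range supported by the strategy (the range below is an
arbitrary example):
@cartouche
@smallexample
/* Accept priorities from -5 to 5. */
starpu_sched_set_min_priority(-5);
starpu_sched_set_max_priority(5);
@end smallexample
@end cartouche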
@node starpu_push_local_task
@subsection @code{starpu_push_local_task}
@deftypefun int starpu_push_local_task (int @var{workerid}, {struct starpu_task} *@var{task}, int @var{back})
The scheduling policy may put tasks directly into a worker's local queue so
that it is not always necessary to create its own queue when the local queue
is sufficient. If @var{back} is not zero, the task is put at the back of the
queue, where the worker will pop tasks first. Setting @var{back} to 0 therefore
ensures a FIFO ordering.
@end deftypefun
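A @code{push_task} method could for instance rely entirely on the workers'
local queues, as in the following sketch (the prototype follows the
@code{struct starpu_sched_policy_s} convention assumed here, and the worker
selection is deliberately simplistic):
@cartouche
@smallexample
static int push_task_dummy(struct starpu_task *task)
@{
    /* A real strategy would select the worker according to performance
       models or current load; here the first worker is always chosen. */
    int workerid = 0;

    /* back == 0: append the task so that the queue keeps a FIFO ordering. */
    return starpu_push_local_task(workerid, task, 0);
@}
@end smallexample
@end cartouche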
@node Source code
@subsection Source code
@cartouche
@smallexample
static struct starpu_sched_policy_s dummy_sched_policy = @{
    .init_sched = init_dummy_sched,
    .deinit_sched = deinit_dummy_sched,
    .push_task = push_task_dummy,
    .push_prio_task = NULL,
    .pop_task = pop_task_dummy,
    .post_exec_hook = NULL,
    .pop_every_task = NULL,
    .policy_name = "dummy",
    .policy_description = "dummy scheduling strategy"
@};
@end smallexample
@end cartouche
@c ---------------------------------------------------------------------
@c C Extensions
@c ---------------------------------------------------------------------
@include c-extensions.texi
@c ---------------------------------------------------------------------
@c Appendices
@c ---------------------------------------------------------------------
@c ---------------------------------------------------------------------
@c Full source code for the 'Scaling a Vector' example
@c ---------------------------------------------------------------------
@node Full source code for the 'Scaling a Vector' example
@appendix Full source code for the 'Scaling a Vector' example
@menu
* Main application::
* CPU Kernel::
* CUDA Kernel::
* OpenCL Kernel::
@end menu
@node Main application
@section Main application
@include vector_scal_c.texi
@node CPU Kernel
@section CPU Kernel
@include vector_scal_cpu.texi
@node CUDA Kernel
@section CUDA Kernel
@include vector_scal_cuda.texi
@node OpenCL Kernel
@section OpenCL Kernel
@menu
* Invoking the kernel::
* Source of the kernel::
@end menu
@node Invoking the kernel
@subsection Invoking the kernel
@include vector_scal_opencl.texi
@node Source of the kernel
@subsection Source of the kernel
@include vector_scal_opencl_codelet.texi
@node GNU Free Documentation License
@appendix GNU Free Documentation License
@include fdl-1.3.texi
@c
@c Indices.
@c
@node Function Index
@unnumbered Function Index
@printindex fn
@bye