12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588 |
- \input texinfo @c -*-texinfo-*-
- @c %**start of header
- @setfilename starpu.info
- @settitle StarPU
- @c %**end of header
- @setchapternewpage odd
- @titlepage
- @title StarPU
- @page
- @vskip 0pt plus 1filll
- @comment For the @value{version-GCC} Version*
- @end titlepage
- @summarycontents
- @contents
- @page
- @node Top
- @top Preface
- @cindex Preface
- This manual documents the usage of StarPU.
- @comment
- @comment When you add a new menu item, please keep the right hand
- @comment aligned to the same column. Do not use tabs. This provides
- @comment better formatting.
- @comment
- @menu
- * Introduction:: A basic introduction to using StarPU
- * Installing StarPU:: How to configure, build and install StarPU
- * Using StarPU::                 How to run a StarPU application
- * Basic Examples:: Basic examples of the use of StarPU
- * Configuring StarPU:: How to configure StarPU
- * StarPU API:: The API to use StarPU
- * Advanced Topics:: Advanced use of StarPU
- * Full source code for the 'Scaling a Vector' example::
- @end menu
- @c ---------------------------------------------------------------------
- @c Introduction to StarPU
- @c ---------------------------------------------------------------------
- @node Introduction
- @chapter Introduction to StarPU
- @menu
- * Motivation:: Why StarPU ?
- * StarPU in a Nutshell:: The Fundamentals of StarPU
- @end menu
- @node Motivation
- @section Motivation
- @c complex machines with heterogeneous cores/devices
- The use of specialized hardware such as accelerators or coprocessors offers an
- interesting approach to overcome the physical limits encountered by processor
- architects. As a result, many machines are now equipped with one or several
- accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
- efforts have been devoted to offloading computation onto such accelerators, very
- little attention has been paid to portability concerns on the one hand, and to the
- possibility of having heterogeneous accelerators and processors to interact on the other hand.
- StarPU is a runtime system that offers support for heterogeneous multicore
- architectures. It not only offers a unified view of the computational resources
- (i.e. CPUs and accelerators at the same time), but it also takes care of
- efficiently mapping and executing tasks onto a heterogeneous machine while
- transparently handling low-level issues in a portable fashion.
- @c this leads to a complicated distributed memory design
- @c which is not (easily) manageable by hand
- @c added value/benefits of StarPU
- @c - portability
- @c - scheduling, perf. portability
- @node StarPU in a Nutshell
- @section StarPU in a Nutshell
- @menu
- * Codelet and Tasks::
- * StarPU Data Management Library::
- * Research Papers::
- @end menu
- From a programming point of view, StarPU is not a new language but a library
- that executes tasks explicitly submitted by the application. The data that a
- task manipulates are automatically transferred onto the accelerator so that the
- programmer does not have to take care of complex data movements. StarPU also
- takes particular care of scheduling those tasks efficiently and allows
- scheduling experts to implement custom scheduling policies in a portable
- fashion.
- @c explain the notion of codelet and task (i.e. g(A, B)
- @node Codelet and Tasks
- @subsection Codelet and Tasks
- One of StarPU's primary data structures is the @b{codelet}. A codelet describes a
- computational kernel that can possibly be implemented on multiple architectures
- such as a CPU, a CUDA device or a Cell's SPU.
- @c TODO insert illustration f : f_spu, f_cpu, ...
- Another important data structure is the @b{task}. Executing a StarPU task
- consists of applying a codelet on a data set, on one of the architectures on
- which the codelet is implemented. In addition to the codelet that a task
- implements, it also describes which data are accessed, and how they are
- accessed during the computation (read and/or write).
- StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
- operation. The task structure can also specify a @b{callback} function that is
- called once StarPU has properly executed the task. It also contains optional
- fields that the application may use to give hints to the scheduler (such as
- priority levels).
- A task may be identified by a unique 64-bit number which we refer to as a @b{tag}.
- Task dependencies can be enforced either by the means of callback functions, or
- by expressing dependencies between tags.
- @c TODO insert illustration f(Ar, Brw, Cr) + ..
- @c DSM
- @node StarPU Data Management Library
- @subsection StarPU Data Management Library
- Because StarPU schedules tasks at runtime, data transfers have to be
- done automatically and ``just-in-time'' between processing units,
- relieving the application programmer from explicit data transfers.
- Moreover, to avoid unnecessary transfers, StarPU keeps data
- where it was last needed, even if it was modified there, and it
- allows multiple copies of the same data to reside at the same time on
- several processing units as long as it is not modified.
- @node Research Papers
- @subsection Research Papers
- Research papers on StarPU can be found on
- @indicateurl{http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html}
- Notably a good overview in the research report
- @indicateurl{http://hal.archives-ouvertes.fr/inria-00467677}
- @c ---------------------------------------------------------------------
- @c Installing StarPU
- @c ---------------------------------------------------------------------
- @node Installing StarPU
- @chapter Installing StarPU
- @menu
- * Downloading StarPU::
- * Configuration of StarPU::
- * Building and Installing StarPU::
- @end menu
- StarPU can be built and installed by the standard means of the GNU
- autotools. The following chapter is intended to briefly remind how these tools
- can be used to install StarPU.
- @node Downloading StarPU
- @section Downloading StarPU
- @menu
- * Getting Sources::
- * Optional dependencies::
- @end menu
- @node Getting Sources
- @subsection Getting Sources
- The source code is managed by a Subversion server hosted by the
- InriaGforge. To get the source code, you need:
- @itemize
- @item
- To install the client side of the software Subversion if it is
- not already available on your system. The software can be obtained from
- @indicateurl{http://subversion.tigris.org}.
- @item
- You can check out the project's SVN repository through anonymous
- access. This will provide you with a read access to the
- repository.
- You can also choose to become a member of the project @code{starpu}.
- For this, you first need to get an account to the gForge server. You
- can then send a request to join the project
- (@indicateurl{https://gforge.inria.fr/project/request.php?group_id=1570}).
- @item
- More information on how to get a gForge account, to become a member of
- a project, or on any other related task can be obtained from the
- InriaGforge at @indicateurl{https://gforge.inria.fr/}. The most important
- thing is to upload your public SSH key on the gForge server (see the
- FAQ at @indicateurl{http://siteadmin.gforge.inria.fr/FAQ.html#Q6} for
- instructions).
- @end itemize
- You can now check out the latest version from the Subversion server:
- @itemize
- @item
- using the anonymous access via svn:
- @example
- % svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk
- @end example
- @item
- using the anonymous access via https:
- @example
- % svn checkout --username anonsvn https://scm.gforge.inria.fr/svn/starpu/trunk
- @end example
- The password is @code{anonsvn}.
- @item
- using your gForge account
- @example
- % svn checkout svn+ssh://<login>@@scm.gforge.inria.fr/svn/starpu/trunk
- @end example
- @end itemize
- These steps require running autoconf and automake to generate the
- @code{./configure} script. This can be done by calling
- @code{./autogen.sh}. The required version for autoconf is 2.60 or
- higher.
- @example
- % ./autogen.sh
- @end example
- If the autotools are not available on your machine or not recent
- enough, you can choose to download the latest nightly tarball, which
- is provided with a @code{configure} script.
- @example
- % wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
- @end example
- @node Optional dependencies
- @subsection Optional dependencies
- The topology discovery library, hwloc, is not mandatory to use StarPU
- but strongly recommended. It makes it possible to increase performance, and to
- perform some topology aware scheduling.
- hwloc is available in major distributions and for most OSes and can be
- downloaded from @indicateurl{http://www.open-mpi.org/software/hwloc}.
- @node Configuration of StarPU
- @section Configuration of StarPU
- @menu
- * Generating Makefiles and configuration scripts::
- * Running the configuration::
- @end menu
- @node Generating Makefiles and configuration scripts
- @subsection Generating Makefiles and configuration scripts
- This step is not necessary when using the tarball releases of StarPU. If you
- are using the source code from the svn repository, you first need to generate
- the configure scripts and the Makefiles.
- @example
- % ./autogen.sh
- @end example
- @node Running the configuration
- @subsection Running the configuration
- @example
- % ./configure
- @end example
- Details about options that are useful to give to @code{./configure} are given in
- @ref{Compilation configuration}.
- @node Building and Installing StarPU
- @section Building and Installing StarPU
- @menu
- * Building::
- * Sanity Checks::
- * Installing::
- @end menu
- @node Building
- @subsection Building
- @example
- % make
- @end example
- @node Sanity Checks
- @subsection Sanity Checks
- In order to make sure that StarPU is working properly on the system, it is also
- possible to run a test suite.
- @example
- % make check
- @end example
- @node Installing
- @subsection Installing
- In order to install StarPU at the location that was specified during
- configuration:
- @example
- % make install
- @end example
- @c ---------------------------------------------------------------------
- @c Using StarPU
- @c ---------------------------------------------------------------------
- @node Using StarPU
- @chapter Using StarPU
- @menu
- * Setting flags for compiling and linking applications::
- * Running a basic StarPU application::
- @end menu
- @node Setting flags for compiling and linking applications
- @section Setting flags for compiling and linking applications
- Compiling and linking an application against StarPU may require the use of
- specific flags or libraries (for instance @code{CUDA} or @code{libspe2}).
- To this end, it is possible to use the @code{pkg-config} tool.
- If StarPU was not installed at some standard location, the path of StarPU's
- library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
- that @code{pkg-config} can find it. For example if StarPU was installed in
- @code{$prefix_dir}:
- @example
- % PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
- @end example
- The flags required to compile or link against StarPU are then
- accessible with the following commands:
- @example
- % pkg-config --cflags libstarpu # options for the compiler
- % pkg-config --libs libstarpu # options for the linker
- @end example
- @node Running a basic StarPU application
- @section Running a basic StarPU application
- Basic examples using StarPU have been built in the directory
- @code{$prefix_dir/lib/starpu/examples/}. You can for example run the
- example @code{vector_scal}.
- @example
- % $prefix_dir/lib/starpu/examples/vector_scal
- BEFORE : First element was 1.000000
- AFTER First element is 3.140000
- %
- @end example
- When StarPU is used for the first time, the directory
- @code{$HOME/.starpu/} is created, performance models will be stored in
- that directory.
- Please note that buses are benchmarked when StarPU is launched for the
- first time. This may take a few minutes, or less if @code{hwloc} is
- installed. This step is done only once per user and per machine.
- @c ---------------------------------------------------------------------
- @c Basic Examples
- @c ---------------------------------------------------------------------
- @node Basic Examples
- @chapter Basic Examples
- @menu
- * Compiling and linking options::
- * Hello World:: Submitting Tasks
- * Scaling a Vector:: Manipulating Data
- * Vector Scaling on an Hybrid CPU/GPU Machine:: Handling Heterogeneous Architectures
- * Task and Worker Profiling::
- * Partitioning Data:: Partitioning Data
- * Performance model example::
- * Theoretical lower bound on execution time::
- * More examples:: More examples shipped with StarPU
- @end menu
- @node Compiling and linking options
- @section Compiling and linking options
- Let's suppose StarPU has been installed in the directory
- @code{$STARPU_DIR}. As explained in @ref{Setting flags for compiling and linking applications},
- the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
- necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
- libraries at runtime.
- @example
- % PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
- % LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
- @end example
- The Makefile could for instance contain the following lines to define which
- options must be given to the compiler and to the linker:
- @cartouche
- @example
- CFLAGS += $$(pkg-config --cflags libstarpu)
- LDFLAGS += $$(pkg-config --libs libstarpu)
- @end example
- @end cartouche
- @node Hello World
- @section Hello World
- @menu
- * Required Headers::
- * Defining a Codelet::
- * Submitting a Task::
- * Execution of Hello World::
- @end menu
- In this section, we show how to implement a simple program that submits a task to StarPU.
- @node Required Headers
- @subsection Required Headers
- The @code{starpu.h} header should be included in any code using StarPU.
- @cartouche
- @smallexample
- #include <starpu.h>
- @end smallexample
- @end cartouche
- @node Defining a Codelet
- @subsection Defining a Codelet
- @cartouche
- @smallexample
- void cpu_func(void *buffers[], void *cl_arg)
- @{
- float *array = cl_arg;
- printf("Hello world (array = @{%f, %f@} )\n", array[0], array[1]);
- @}
- starpu_codelet cl =
- @{
- .where = STARPU_CPU,
- .cpu_func = cpu_func,
- .nbuffers = 0
- @};
- @end smallexample
- @end cartouche
- A codelet is a structure that represents a computational kernel. Such a codelet
- may contain an implementation of the same kernel on different architectures
- (e.g. CUDA, Cell's SPU, x86, ...).
- The @code{nbuffers} field specifies the number of data buffers that are
- manipulated by the codelet: here the codelet does not access or modify any data
- that is controlled by our data management library. Note that the argument
- passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
- structure) does not count as a buffer since it is not managed by our data
- management library.
- @c TODO need a crossref to the proper description of "where" see bla for more ...
- We create a codelet which may only be executed on the CPUs. The @code{where}
- field is a bitmask that defines where the codelet may be executed. Here, the
- @code{STARPU_CPU} value means that only CPUs can execute this codelet
- (@pxref{Codelets and Tasks} for more details on this field).
- When a CPU core executes a codelet, it calls the @code{cpu_func} function,
- which @emph{must} have the following prototype:
- @code{void (*cpu_func)(void *buffers[], void *cl_arg);}
- In this example, we can ignore the first argument of this function which gives a
- description of the input and output buffers (e.g. the size and the location of
- the matrices). The second argument is a pointer to a buffer passed as an
- argument to the codelet by the means of the @code{cl_arg} field of the
- @code{starpu_task} structure.
- @c TODO rewrite so that it is a little clearer ?
- Be aware that this may be a pointer to a
- @emph{copy} of the actual buffer, and not the pointer given by the programmer:
- if the codelet modifies this buffer, there is no guarantee that the initial
- buffer will be modified as well: this for instance implies that the buffer
- cannot be used as a synchronization medium.
- @node Submitting a Task
- @subsection Submitting a Task
- @cartouche
- @smallexample
- void callback_func(void *callback_arg)
- @{
- printf("Callback function (arg %x)\n", callback_arg);
- @}
- int main(int argc, char **argv)
- @{
- /* @b{initialize StarPU} */
- starpu_init(NULL);
- struct starpu_task *task = starpu_task_create();
- task->cl = &cl; /* @b{Pointer to the codelet defined above} */
- float array[2] = @{1.0f, -1.0f@};
- task->cl_arg = &array;
- task->cl_arg_size = sizeof(array);
- task->callback_func = callback_func;
- task->callback_arg = 0x42;
- /* @b{starpu_task_submit will be a blocking call} */
- task->synchronous = 1;
- /* @b{submit the task to StarPU} */
- starpu_task_submit(task);
- /* @b{terminate StarPU} */
- starpu_shutdown();
- return 0;
- @}
- @end smallexample
- @end cartouche
- Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
- @code{NULL} argument specifies that we use default configuration. Tasks cannot
- be submitted after the termination of StarPU by a call to
- @code{starpu_shutdown}.
- In the example above, a task structure is allocated by a call to
- @code{starpu_task_create}. This function only allocates and fills the
- corresponding structure with the default settings (@pxref{starpu_task_create}),
- but it does not submit the task to StarPU.
- @c not really clear ;)
- The @code{cl} field is a pointer to the codelet which the task will
- execute: in other words, the codelet structure describes which computational
- kernel should be offloaded on the different architectures, and the task
- structure is a wrapper containing a codelet and the piece of data on which the
- codelet should operate.
- The optional @code{cl_arg} field is a pointer to a buffer (of size
- @code{cl_arg_size}) with some parameters for the kernel
- described by the codelet. For instance, if a codelet implements a computational
- kernel that multiplies its input vector by a constant, the constant could be
- specified by the means of this buffer, instead of registering it.
- Once a task has been executed, an optional callback function can be called.
- While the computational kernel could be offloaded on various architectures, the
- callback function is always executed on a CPU. The @code{callback_arg}
- pointer is passed as an argument of the callback. The prototype of a callback
- function must be:
- @code{void (*callback_function)(void *);}
- If the @code{synchronous} field is non-null, task submission will be
- synchronous: the @code{starpu_task_submit} function will not return until the
- task was executed. Note that the @code{starpu_shutdown} method does not
- guarantee that asynchronous tasks have been executed before it returns.
- @node Execution of Hello World
- @subsection Execution of Hello World
- @smallexample
- % make hello_world
- cc $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu) hello_world.c -o hello_world
- % ./hello_world
- Hello world (array = @{1.000000, -1.000000@} )
- Callback function (arg 42)
- @end smallexample
- @node Scaling a Vector
- @section Manipulating Data: Scaling a Vector
- The previous example has shown how to submit tasks. In this section,
- we show how StarPU tasks can manipulate data. The full source code for
- this example is given in @ref{Full source code for the 'Scaling a Vector' example}.
- @menu
- * Source code of Vector Scaling::
- * Execution of Vector Scaling::
- @end menu
- @node Source code of Vector Scaling
- @subsection Source code of Vector Scaling
- Programmers can describe the data layout of their application so that StarPU is
- responsible for enforcing data coherency and availability across the machine.
- Instead of handling complex (and non-portable) mechanisms to perform data
- movements, programmers only declare which piece of data is accessed and/or
- modified by a task, and StarPU makes sure that when a computational kernel
- starts somewhere (e.g. on a GPU), its data are available locally.
- Before submitting those tasks, the programmer first needs to declare the
- different pieces of data to StarPU using the @code{starpu_*_data_register}
- functions. To ease the development of applications for StarPU, it is possible
- to describe multiple types of data layout. A type of data layout is called an
- @b{interface}. By default, there are different interfaces available in StarPU:
- here we will consider the @b{vector interface}.
- The following lines show how to declare an array of @code{NX} elements of type
- @code{float} using the vector interface:
- @cartouche
- @smallexample
- float vector[NX];
- starpu_data_handle vector_handle;
- starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
- sizeof(vector[0]));
- @end smallexample
- @end cartouche
- The first argument, called the @b{data handle}, is an opaque pointer which
- designates the array in StarPU. This is also the structure which is used to
- describe which data is used by a task. The second argument is the node number
- where the data currently resides. Here it is 0 since the @code{vector} array is in
- the main memory. Then comes the pointer @code{vector} where the data can be found,
- the number of elements in the vector and the size of each element.
- It is possible to construct a StarPU task that will manipulate the
- vector and a constant factor.
- @cartouche
- @smallexample
- float factor = 3.14;
- struct starpu_task *task = starpu_task_create();
- task->cl = &cl; /* @b{Pointer to the codelet defined below} */
- task->buffers[0].handle = vector_handle; /* @b{First parameter of the codelet} */
- task->buffers[0].mode = STARPU_RW;
- task->cl_arg = &factor;
- task->cl_arg_size = sizeof(factor);
- task->synchronous = 1;
- starpu_task_submit(task);
- @end smallexample
- @end cartouche
- Since the factor is a mere float value parameter, it does not need a preliminary registration, and
- can just be passed through the @code{cl_arg} pointer like in the previous
- example. The vector parameter is described by its handle.
- There are two fields in each element of the @code{buffers} array.
- @code{handle} is the handle of the data, and @code{mode} specifies how the
- kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
- write-only and @code{STARPU_RW} for read and write access).
- The definition of the codelet can be written as follows:
- @cartouche
- @smallexample
- void scal_cpu_func(void *buffers[], void *cl_arg)
- @{
- unsigned i;
- float *factor = cl_arg;
- /* length of the vector */
- unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
- /* local copy of the vector pointer */
- float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
- for (i = 0; i < n; i++)
- val[i] *= *factor;
- @}
- starpu_codelet cl = @{
- .where = STARPU_CPU,
- .cpu_func = scal_cpu_func,
- .nbuffers = 1
- @};
- @end smallexample
- @end cartouche
- The second argument of the @code{scal_cpu_func} function contains a pointer to the
- parameters of the codelet (given in @code{task->cl_arg}), so that we read the
- constant factor from this pointer. The first argument is an array that gives
- a description of all the buffers passed in the @code{task->buffers} array. The
- size of this array is given by the @code{nbuffers} field of the codelet
- structure. For the sake of generality, this array contains pointers to the
- different interfaces describing each buffer. In the case of the @b{vector
- interface}, the location of the vector (resp. its length) is accessible in the
- @code{ptr} (resp. @code{nx}) field of this array. Since the vector is accessed in a
- read-write fashion, any modification will automatically affect future accesses
- to this vector made by other tasks.
- @node Execution of Vector Scaling
- @subsection Execution of Vector Scaling
- @smallexample
- % make vector_scal
- cc $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu) vector_scal.c -o vector_scal
- % ./vector_scal
- 0.000000 3.000000 6.000000 9.000000 12.000000
- @end smallexample
- @node Vector Scaling on an Hybrid CPU/GPU Machine
- @section Vector Scaling on an Hybrid CPU/GPU Machine
- Contrary to the previous examples, the task submitted in this example may not
- only be executed by the CPUs, but also by a CUDA device.
- @menu
- * Definition of the CUDA Codelet::
- * Definition of the OpenCL Codelet::
- * Definition of the Main Code::
- * Execution of Hybrid Vector Scaling::
- @end menu
- @node Definition of the CUDA Codelet
- @subsection Definition of the CUDA Codelet
- The CUDA implementation can be written as follows. It needs to be
- compiled with a CUDA compiler such as nvcc, the NVIDIA CUDA compiler
- driver.
- @cartouche
- @smallexample
- #include <starpu.h>
- static __global__ void vector_mult_cuda(float *val, unsigned n,
- float factor)
- @{
- unsigned i;
- for(i = 0 ; i < n ; i++)
- val[i] *= factor;
- @}
- extern "C" void scal_cuda_func(void *buffers[], void *_args)
- @{
- float *factor = (float *)_args;
- /* length of the vector */
- unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
- /* local copy of the vector pointer */
- float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
- @i{ vector_mult_cuda<<<1,1>>>(val, n, *factor);}
- @i{ cudaThreadSynchronize();}
- @}
- @end smallexample
- @end cartouche
- @node Definition of the OpenCL Codelet
- @subsection Definition of the OpenCL Codelet
- The OpenCL implementation can be written as follows. StarPU provides
- tools to compile an OpenCL codelet stored in a file.
- @cartouche
- @smallexample
- __kernel void vector_mult_opencl(__global float* val, int nx, float factor)
- @{
- const int i = get_global_id(0);
- if (i < nx) @{
- val[i] *= factor;
- @}
- @}
- @end smallexample
- @end cartouche
- @cartouche
- @smallexample
- #include <starpu.h>
- @i{#include <starpu_opencl.h>}
- @i{extern struct starpu_opencl_program programs;}
- void scal_opencl_func(void *buffers[], void *_args)
- @{
- float *factor = _args;
- @i{ int id, devid, err;}
- @i{ cl_kernel kernel;}
- @i{ cl_command_queue queue;}
- /* length of the vector */
- unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
- /* local copy of the vector pointer */
- float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
- @i{ id = starpu_worker_get_id();}
- @i{ devid = starpu_worker_get_devid(id);}
- @i{ err = starpu_opencl_load_kernel(&kernel, &queue, &programs,}
- @i{ "vector_mult_opencl", devid); /* @b{Name of the codelet defined above} */}
- @i{ if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
- @i{ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);}
- @i{ err |= clSetKernelArg(kernel, 1, sizeof(n), &n);}
- @i{ err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);}
- @i{ if (err) STARPU_OPENCL_REPORT_ERROR(err);}
- @i{ @{}
- @i{ size_t global=1;}
- @i{ size_t local=1;}
- @i{ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);}
- @i{ if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
- @i{ @}}
- @i{ clFinish(queue);}
- @i{ starpu_opencl_release_kernel(kernel);}
- @}
- @end smallexample
- @end cartouche
- @node Definition of the Main Code
- @subsection Definition of the Main Code
- The CPU implementation is the same as in the previous section.
- Here is the source of the main application. You can notice the value of the
- field @code{where} for the codelet. We specify
- @code{STARPU_CPU|STARPU_CUDA|STARPU_OPENCL} to indicate to StarPU that the codelet
- can be executed either on a CPU or on a CUDA or an OpenCL device.
- @cartouche
- @smallexample
- #include <starpu.h>
- #define NX 2048
- extern void scal_cuda_func(void *buffers[], void *_args);
- extern void scal_cpu_func(void *buffers[], void *_args);
- extern void scal_opencl_func(void *buffers[], void *_args);
- /* @b{Definition of the codelet} */
- static starpu_codelet cl = @{
- .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL, /* @b{It can be executed on a CPU,} */
- /* @b{on a CUDA device, or on an OpenCL device} */
- .cuda_func = scal_cuda_func,
- .cpu_func = scal_cpu_func,
- .opencl_func = scal_opencl_func,
- .nbuffers = 1
- @};
- #ifdef STARPU_USE_OPENCL
- /* @b{The compiled version of the OpenCL program} */
- struct starpu_opencl_program programs;
- #endif
- int main(int argc, char **argv)
- @{
- float *vector;
- int i, ret;
- float factor=3.0;
- struct starpu_task *task;
- starpu_data_handle vector_handle;
- starpu_init(NULL); /* @b{Initialising StarPU} */
- #ifdef STARPU_USE_OPENCL
- starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_codelet.cl",
- &programs);
- #endif
- vector = malloc(NX*sizeof(vector[0]));
- assert(vector);
- for(i=0 ; i<NX ; i++) vector[i] = i;
- @end smallexample
- @end cartouche
- @cartouche
- @smallexample
- /* @b{Registering data within StarPU} */
- starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
- NX, sizeof(vector[0]));
- /* @b{Definition of the task} */
- task = starpu_task_create();
- task->cl = &cl;
- task->buffers[0].handle = vector_handle;
- task->buffers[0].mode = STARPU_RW;
- task->cl_arg = &factor;
- task->cl_arg_size = sizeof(factor);
- @end smallexample
- @end cartouche
- @cartouche
- @smallexample
- /* @b{Submitting the task} */
- ret = starpu_task_submit(task);
- if (ret == -ENODEV) @{
- fprintf(stderr, "No worker may execute this task\n");
- return 1;
- @}
- /* @b{Waiting for its termination} */
- starpu_task_wait_for_all();
- /* @b{Update the vector in RAM} */
- starpu_data_acquire(vector_handle, STARPU_R);
- @end smallexample
- @end cartouche
- @cartouche
- @smallexample
- /* @b{Access the data} */
- for(i=0 ; i<NX; i++) @{
- fprintf(stderr, "%f ", vector[i]);
- @}
- fprintf(stderr, "\n");
- /* @b{Release the data and shutdown StarPU} */
- starpu_data_release(vector_handle);
- starpu_shutdown();
- return 0;
- @}
- @end smallexample
- @end cartouche
- @node Execution of Hybrid Vector Scaling
- @subsection Execution of Hybrid Vector Scaling
- The Makefile given at the beginning of the section must be extended to
- give the rules to compile the CUDA source code. Note that the source
- file of the OpenCL codelet does not need to be compiled now, it will
- be compiled at run-time when calling the function
- @code{starpu_opencl_load_opencl_from_file} (@pxref{starpu_opencl_load_opencl_from_file}).
- @cartouche
- @smallexample
- CFLAGS += $(shell pkg-config --cflags libstarpu)
- LDFLAGS += $(shell pkg-config --libs libstarpu)
- CC = gcc
- vector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o
- %.o: %.cu
- nvcc $(CFLAGS) $< -c -o $@
- clean:
- rm -f vector_scal *.o
- @end smallexample
- @end cartouche
- @smallexample
- % make
- @end smallexample
- and to execute it, with the default configuration:
- @smallexample
- % ./vector_scal
- 0.000000 3.000000 6.000000 9.000000 12.000000
- @end smallexample
- or for example, by disabling CPU devices:
- @smallexample
- % STARPU_NCPUS=0 ./vector_scal
- 0.000000 3.000000 6.000000 9.000000 12.000000
- @end smallexample
- or by disabling CUDA devices:
- @smallexample
- % STARPU_NCUDA=0 ./vector_scal
- 0.000000 3.000000 6.000000 9.000000 12.000000
- @end smallexample
- @node Task and Worker Profiling
- @section Task and Worker Profiling
- A full example showing how to use the profiling API is available in
- the StarPU sources in the directory @code{examples/profiling/}.
- @cartouche
- @smallexample
- struct starpu_task *task = starpu_task_create();
- task->cl = &cl;
- task->synchronous = 1;
- /* We will destroy the task structure by hand so that we can
- * query the profiling info before the task is destroyed. */
- task->destroy = 0;
- starpu_task_submit(task);
- /* The task is finished, get profiling information */
- struct starpu_task_profiling_info *info = task->profiling_info;
- /* How much time did it take before the task started ? */
- double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
- /* How long was the task execution ? */
- double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
- /* We don't need the task structure anymore */
- starpu_task_destroy(task);
- @end smallexample
- @end cartouche
- @cartouche
- @smallexample
- /* Display the occupancy of all workers during the test */
- int worker;
- for (worker = 0; worker < starpu_worker_get_count(); worker++)
- @{
- struct starpu_worker_profiling_info worker_info;
- int ret = starpu_worker_get_profiling_info(worker, &worker_info);
- STARPU_ASSERT(!ret);
- double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
- double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
- double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
- float executing_ratio = 100.0*executing_time/total_time;
- float sleeping_ratio = 100.0*sleeping_time/total_time;
- char workername[128];
- starpu_worker_get_name(worker, workername, 128);
- fprintf(stderr, "Worker %s:\n", workername);
- fprintf(stderr, "\ttotal time : %.2lf ms\n", total_time*1e-3);
- fprintf(stderr, "\texec time : %.2lf ms (%.2f %%)\n", executing_time*1e-3,
- executing_ratio);
- fprintf(stderr, "\tblocked time : %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
- sleeping_ratio);
- @}
- @end smallexample
- @end cartouche
- @node Partitioning Data
- @section Partitioning Data
- An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
- @cartouche
- @smallexample
- int vector[NX];
- starpu_data_handle handle;
- /* Declare data to StarPU */
- starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
- /* Partition the vector in PARTS sub-vectors */
- starpu_filter f =
- @{
- .filter_func = starpu_block_filter_func_vector,
- .nchildren = PARTS,
- .get_nchildren = NULL,
- .get_child_ops = NULL
- @};
- starpu_data_partition(handle, &f);
- @end smallexample
- @end cartouche
- @cartouche
- @smallexample
- /* Submit a task on each sub-vector */
- for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
- starpu_data_handle sub_handle = starpu_data_get_sub_data(handle, 1, i);
- struct starpu_task *task = starpu_task_create();
- task->buffers[0].handle = sub_handle;
- task->buffers[0].mode = STARPU_RW;
- task->cl = &cl;
- task->synchronous = 1;
- task->cl_arg = &factor;
- task->cl_arg_size = sizeof(factor);
- starpu_task_submit(task);
- @}
- @end smallexample
- @end cartouche
- Partitioning can be applied several times, see
- @code{examples/basic_examples/mult.c} and @code{examples/filters/}.
- @node Performance model example
- @section Performance model example
- To achieve good scheduling, StarPU scheduling policies need to be able to
- estimate in advance the duration of a task. This is done by giving to codelets a
- performance model. There are several kinds of performance models.
- @itemize
- @item
- Providing an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
- see for instance
- @code{examples/common/blas_model.c} and @code{examples/common/blas_model.h}. It can also be provided for each architecture (@code{STARPU_PER_ARCH} model type and @code{per_arch} field)
- @item
- Measured at runtime (STARPU_HISTORY_BASED model type). This assumes that for a
- given set of data input/output sizes, the performance will always be about the
- same. This is very true for regular kernels on GPUs for instance (<0.1% error),
- and just a bit less true on CPUs (~=1% error). This also assumes that there are
- few different sets of data input/output sizes. StarPU will then keep record of
- the average time of previous executions on the various processing units, and use
- it as an estimation. It will also save it in @code{~/.starpu/sampling/codelets}
- for further executions. The following is a small code example.
- @cartouche
- @smallexample
- static struct starpu_perfmodel_t mult_perf_model = @{
- .type = STARPU_HISTORY_BASED,
- .symbol = "mult_perf_model"
- @};
- starpu_codelet cl = @{
- .where = STARPU_CPU,
- .cpu_func = cpu_mult,
- .nbuffers = 3,
- /* for the scheduling policy to be able to use performance models */
- .model = &mult_perf_model
- @};
- @end smallexample
- @end cartouche
- @item
- Measured at runtime and refined by regression (STARPU_REGRESSION_BASED model
- type). This still assumes performance regularity, but can work with various data
- input sizes, by applying a*n^b+c regression over observed execution times.
- @end itemize
- @node Theoretical lower bound on execution time
- @section Theoretical lower bound on execution time
- For kernels with history-based performance models, StarPU can very easily provide a theoretical lower
- bound for the execution time of a whole set of tasks. See for
- instance @code{examples/lu/lu_example.c}: before submitting tasks,
- call @code{starpu_bound_start}, and after complete execution, call
- @code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
- @code{starpu_bound_print_mps} can then be used to output a Linear Programming
- problem corresponding to the schedule of your tasks. Run it through
- @code{lp_solve} or any other linear programming solver, and that will give you a
- lower bound for the total execution time of your tasks. If StarPU was compiled
- with the glpk library installed, @code{starpu_bound_compute} can be used to
- solve it immediately and get the optimized minimum. Its @code{integer}
- parameter makes it possible to decide whether integer resolution should be computed
- and returned.
- The @code{deps} parameter tells StarPU whether to take tasks and implicit data
- dependencies into account. It must be understood that the linear programming
- problem size is quadratic with the number of tasks and thus the time to solve it
- will be very long, it could be minutes for just a few dozen tasks. You should
- probably use @code{lp_solve -timeout 1 test.lp -wmps test.mps} to convert the
- problem to MPS format and then use a better solver, @code{glpsol} might be
- better than @code{lp_solve} for instance (the @code{--pcost} option may be
- useful), but sometimes doesn't manage to converge. @code{cbc} might look
- slower, but it is parallel. Be sure to try at least all the @code{-B} options
- of @code{lp_solve}. For instance, we often just use
- @code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
- the @code{-gr} option can also be quite useful.
- Setting @code{deps} to 0 will only take into account the actual computations
- on processing units. It however still properly takes into account the varying
- performances of kernels and processing units, which is quite more accurate than
- just comparing StarPU performances with the fastest of the kernels being used.
- The @code{prio} parameter tells StarPU whether to simulate taking into account
- the priorities as the StarPU scheduler would, i.e. schedule prioritized
- tasks before less prioritized tasks, to check to which extent this results
- in a less optimal solution. This increases even more computation time.
- Note that all this however doesn't take into account data transfer, which is
- assumed to be completely overlapped.
- @node More examples
- @section More examples
- More examples are available in the StarPU sources in the @code{examples/}
- directory. Simple examples include:
- @table @asis
- @item @code{incrementer/}:
- Trivial incrementation test.
- @item @code{basic_examples/}:
- Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
- in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
- product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
- interface, and an example using the variable data interface.
- @item @code{matvecmult/}:
- OpenCL example from NVidia, adapted to StarPU.
- @item @code{axpy/}:
- AXPY CUBLAS operation adapted to StarPU.
- @item @code{fortran/}:
- Example of Fortran bindings.
- @end table
- More advanced examples include:
- @table @asis
- @item @code{filters/}:
- Examples using filters, as shown in @ref{Partitioning Data}.
- @item @code{lu/}:
- LU matrix factorization.
- @end table
- @c ---------------------------------------------------------------------
- @c Configuration options
- @c ---------------------------------------------------------------------
- @node Configuring StarPU
- @chapter Configuring StarPU
- @menu
- * Compilation configuration::
- * Execution configuration through environment variables::
- @end menu
- @node Compilation configuration
- @section Compilation configuration
- The following arguments can be given to the @code{configure} script.
- @menu
- * Common configuration::
- * Configuring workers::
- * Advanced configuration::
- @end menu
- @node Common configuration
- @subsection Common configuration
- @menu
- * --enable-debug::
- * --enable-fast::
- * --enable-verbose::
- * --enable-coverage::
- @end menu
- @node --enable-debug
- @subsubsection @code{--enable-debug}
- @table @asis
- @item @emph{Description}:
- Enable debugging messages.
- @end table
- @node --enable-fast
- @subsubsection @code{--enable-fast}
- @table @asis
- @item @emph{Description}:
- Do not enforce assertions, saves a lot of time spent to compute them otherwise.
- @end table
- @node --enable-verbose
- @subsubsection @code{--enable-verbose}
- @table @asis
- @item @emph{Description}:
- Augment the verbosity of the debugging messages.
- @end table
- @node --enable-coverage
- @subsubsection @code{--enable-coverage}
- @table @asis
- @item @emph{Description}:
- Enable flags for the coverage tool.
- @end table
- @node Configuring workers
- @subsection Configuring workers
- @menu
- * --enable-nmaxcpus::
- * --disable-cpu::
- * --enable-maxcudadev::
- * --disable-cuda::
- * --with-cuda-dir::
- * --enable-maxopencldev::
- * --disable-opencl::
- * --with-opencl-dir::
- * --enable-gordon::
- * --with-gordon-dir::
- @end menu
- @node --enable-nmaxcpus
- @subsubsection @code{--enable-nmaxcpus=<number>}
- @table @asis
- @item @emph{Description}:
- Defines the maximum number of CPU cores that StarPU will support, then
- available as the @code{STARPU_NMAXCPUS} macro.
- @end table
- @node --disable-cpu
- @subsubsection @code{--disable-cpu}
- @table @asis
- @item @emph{Description}:
- Disable the use of CPUs of the machine. Only GPUs etc. will be used.
- @end table
- @node --enable-maxcudadev
- @subsubsection @code{--enable-maxcudadev=<number>}
- @table @asis
- @item @emph{Description}:
- Defines the maximum number of CUDA devices that StarPU will support, then
- available as the @code{STARPU_MAXCUDADEVS} macro.
- @end table
- @node --disable-cuda
- @subsubsection @code{--disable-cuda}
- @table @asis
- @item @emph{Description}:
- Disable the use of CUDA, even if a valid CUDA installation was detected.
- @end table
- @node --with-cuda-dir
- @subsubsection @code{--with-cuda-dir=<path>}
- @table @asis
- @item @emph{Description}:
- Specify the directory where CUDA is installed. This directory should notably contain
- @code{include/cuda.h}.
- @end table
- @node --enable-maxopencldev
- @subsubsection @code{--enable-maxopencldev=<number>}
- @table @asis
- @item @emph{Description}:
- Defines the maximum number of OpenCL devices that StarPU will support, then
- available as the @code{STARPU_MAXOPENCLDEVS} macro.
- @end table
- @node --disable-opencl
- @subsubsection @code{--disable-opencl}
- @table @asis
- @item @emph{Description}:
- Disable the use of OpenCL, even if the SDK is detected.
- @end table
- @node --with-opencl-dir
- @subsubsection @code{--with-opencl-dir=<path>}
- @table @asis
- @item @emph{Description}:
- Specify the location of the OpenCL SDK. This directory should notably contain
- @code{include/CL/cl.h}.
- @end table
- @node --enable-gordon
- @subsubsection @code{--enable-gordon}
- @table @asis
- @item @emph{Description}:
- Enable the use of the Gordon runtime for Cell SPUs.
- @c TODO: rather default to enabled when detected
- @end table
- @node --with-gordon-dir
- @subsubsection @code{--with-gordon-dir=<path>}
- @table @asis
- @item @emph{Description}:
- Specify the location of the Gordon SDK.
- @end table
- @node Advanced configuration
- @subsection Advanced configuration
- @menu
- * --enable-perf-debug::
- * --enable-model-debug::
- * --enable-stats::
- * --enable-maxbuffers::
- * --enable-allocation-cache::
- * --enable-opengl-render::
- * --enable-blas-lib::
- * --with-magma::
- * --with-fxt::
- * --with-perf-model-dir::
- * --with-mpicc::
- * --with-goto-dir::
- * --with-atlas-dir::
- @end menu
- @node --enable-perf-debug
- @subsubsection @code{--enable-perf-debug}
- @table @asis
- @item @emph{Description}:
- Enable performance debugging.
- @end table
- @node --enable-model-debug
- @subsubsection @code{--enable-model-debug}
- @table @asis
- @item @emph{Description}:
- Enable performance model debugging.
- @end table
- @node --enable-stats
- @subsubsection @code{--enable-stats}
- @table @asis
- @item @emph{Description}:
- Enable statistics.
- @end table
- @node --enable-maxbuffers
- @subsubsection @code{--enable-maxbuffers=<nbuffers>}
- @table @asis
- @item @emph{Description}:
- Define the maximum number of buffers that tasks will be able to take
- as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
- @end table
- @node --enable-allocation-cache
- @subsubsection @code{--enable-allocation-cache}
- @table @asis
- @item @emph{Description}:
- Enable the use of a data allocation cache to avoid the cost of it with
- CUDA. Still experimental.
- @end table
- @node --enable-opengl-render
- @subsubsection @code{--enable-opengl-render}
- @table @asis
- @item @emph{Description}:
- Enable the use of OpenGL for the rendering of some examples.
- @c TODO: rather default to enabled when detected
- @end table
- @node --enable-blas-lib
- @subsubsection @code{--enable-blas-lib=<name>}
- @table @asis
- @item @emph{Description}:
- Specify the blas library to be used by some of the examples. The
- library has to be 'atlas' or 'goto'.
- @end table
- @node --with-magma
- @subsubsection @code{--with-magma=<path>}
- @table @asis
- @item @emph{Description}:
- Specify where magma is installed.
- @end table
- @node --with-fxt
- @subsubsection @code{--with-fxt=<path>}
- @table @asis
- @item @emph{Description}:
- Specify the location of FxT (for generating traces and rendering them
- using ViTE). This directory should notably contain
- @code{include/fxt/fxt.h}.
- @end table
- @node --with-perf-model-dir
- @subsubsection @code{--with-perf-model-dir=<dir>}
- @table @asis
- @item @emph{Description}:
- Specify where performance models should be stored (instead of defaulting to the
- current user's home).
- @end table
- @node --with-mpicc
- @subsubsection @code{--with-mpicc=<path to mpicc>}
- @table @asis
- @item @emph{Description}:
- Specify the location of the @code{mpicc} compiler to be used for starpumpi.
- @end table
- @node --with-goto-dir
- @subsubsection @code{--with-goto-dir=<dir>}
- @table @asis
- @item @emph{Description}:
- Specify the location of GotoBLAS.
- @end table
- @node --with-atlas-dir
- @subsubsection @code{--with-atlas-dir=<dir>}
- @table @asis
- @item @emph{Description}:
- Specify the location of ATLAS. This directory should notably contain
- @code{include/cblas.h}.
- @end table
- @c ---------------------------------------------------------------------
- @c Environment variables
- @c ---------------------------------------------------------------------
- @node Execution configuration through environment variables
- @section Execution configuration through environment variables
- @menu
- * Workers:: Configuring workers
- * Scheduling:: Configuring the Scheduling engine
- * Misc:: Miscellaneous and debug
- @end menu
- Note: the values given in @code{starpu_conf} structure passed when
- calling @code{starpu_init} will override the values of the environment
- variables.
- @node Workers
- @subsection Configuring workers
- @menu
- * STARPU_NCPUS:: Number of CPU workers
- * STARPU_NCUDA:: Number of CUDA workers
- * STARPU_NOPENCL:: Number of OpenCL workers
- * STARPU_NGORDON:: Number of SPU workers (Cell)
- * STARPU_WORKERS_CPUID:: Bind workers to specific CPUs
- * STARPU_WORKERS_CUDAID:: Select specific CUDA devices
- * STARPU_WORKERS_OPENCLID:: Select specific OpenCL devices
- @end menu
- @node STARPU_NCPUS
- @subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
- @table @asis
- @item @emph{Description}:
- Specify the maximum number of CPU workers. Note that StarPU will not allocate
- more CPUs than there are physical CPUs, and that some CPUs are used to control
- the accelerators.
- @end table
- @node STARPU_NCUDA
- @subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
- @table @asis
- @item @emph{Description}:
- Specify the maximum number of CUDA devices that StarPU can use. If
- @code{STARPU_NCUDA} is lower than the number of physical devices, it is
- possible to select which CUDA devices should be used by the means of the
- @code{STARPU_WORKERS_CUDAID} environment variable.
- @end table
- @node STARPU_NOPENCL
- @subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
- @table @asis
- @item @emph{Description}:
- OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
- @end table
- @node STARPU_NGORDON
- @subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
- @table @asis
- @item @emph{Description}:
- Specify the maximum number of SPUs that StarPU can use.
- @end table
- @node STARPU_WORKERS_CPUID
- @subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
- @table @asis
- @item @emph{Description}:
- Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
- specifies on which logical CPU the different workers should be
- bound. For instance, if @code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the first
- worker will be bound to logical CPU #1, the second CPU worker will be bound to
- logical CPU #3 and so on. Note that the logical ordering of the CPUs is either
- determined by the OS, or provided by the @code{hwloc} library in case it is
- available.
- Note that the first workers correspond to the CUDA workers, then come the
- OpenCL and the SPU, and finally the CPU workers. For example if
- we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
- and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
- by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
- the logical CPUs #1 and #3 will be used by the CPU workers.
- If the number of workers is larger than the array given in
- @code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
- round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
- third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
- This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
- @code{starpu_conf} structure passed to @code{starpu_init} is set.
- @end table
- @node STARPU_WORKERS_CUDAID
- @subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
- @table @asis
- @item @emph{Description}:
- Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
- possible to select which CUDA devices should be used by StarPU. On a machine
- equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
- @code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
- they should use CUDA devices #1 and #3 (the logical ordering of the devices is
- the one reported by CUDA).
- This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
- the @code{starpu_conf} structure passed to @code{starpu_init} is set.
- @end table
- @node STARPU_WORKERS_OPENCLID
- @subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
- @table @asis
- @item @emph{Description}:
- OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
- This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
- the @code{starpu_conf} structure passed to @code{starpu_init} is set.
- @end table
- @node Scheduling
- @subsection Configuring the Scheduling engine
- @menu
- * STARPU_SCHED:: Scheduling policy
- * STARPU_CALIBRATE:: Calibrate performance models
- * STARPU_PREFETCH:: Use data prefetch
- * STARPU_SCHED_ALPHA:: Computation factor
- * STARPU_SCHED_BETA:: Communication factor
- @end menu
- @node STARPU_SCHED
- @subsubsection @code{STARPU_SCHED} -- Scheduling policy
- @table @asis
- @item @emph{Description}:
- This chooses between the different scheduling policies proposed by StarPU:
- work stealing, random, greedy, with performance models, etc.
- Use @code{STARPU_SCHED=help} to get the list of available schedulers.
- @end table
- @node STARPU_CALIBRATE
- @subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
- @table @asis
- @item @emph{Description}:
- If this variable is set to 1, the performance models are calibrated during
- the execution. If it is set to 2, the previous values are dropped to restart
- calibration from scratch.
- Note: this currently only applies to dm and dmda scheduling policies.
- @end table
- @node STARPU_PREFETCH
- @subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
- @table @asis
- @item @emph{Description}:
- This variable indicates whether data prefetching should be enabled (0 means
- that it is disabled). If prefetching is enabled, when a task is scheduled to be
- executed e.g. on a GPU, StarPU will request an asynchronous transfer in
- advance, so that data is already present on the GPU when the task starts. As a
- result, computation and data transfers are overlapped.
- @end table
- @node STARPU_SCHED_ALPHA
- @subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
- @table @asis
- @item @emph{Description}:
- To estimate the cost of a task StarPU takes into account the estimated
- computation time (obtained thanks to performance models). The alpha factor is
- the coefficient to be applied to it before adding it to the communication part.
- @end table
- @node STARPU_SCHED_BETA
- @subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
- @table @asis
- @item @emph{Description}:
- To estimate the cost of a task StarPU takes into account the estimated
- data transfer time (obtained thanks to performance models). The beta factor is
- the coefficient to be applied to it before adding it to the computation part.
- @end table
- @node Misc
- @subsection Miscellaneous and debug
- @menu
- * STARPU_LOGFILENAME:: Select debug file name
- @end menu
- @node STARPU_LOGFILENAME
- @subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
- @table @asis
- @item @emph{Description}:
- This variable specifies the file to which the debugging output should be saved.
- @end table
- @c ---------------------------------------------------------------------
- @c StarPU API
- @c ---------------------------------------------------------------------
- @node StarPU API
- @chapter StarPU API
- @menu
- * Initialization and Termination:: Initialization and Termination methods
- * Workers' Properties:: Methods to enumerate workers' properties
- * Data Library:: Methods to manipulate data
- * Data Interfaces::
- * Data Partition::
- * Codelets and Tasks:: Methods to construct tasks
- * Explicit Dependencies:: Explicit Dependencies
- * Implicit Data Dependencies:: Implicit Data Dependencies
- * Performance Model API::
- * Profiling API:: Profiling API
- * CUDA extensions:: CUDA extensions
- * OpenCL extensions:: OpenCL extensions
- * Cell extensions:: Cell extensions
- * Miscellaneous helpers::
- @end menu
- @node Initialization and Termination
- @section Initialization and Termination
- @menu
- * starpu_init:: Initialize StarPU
- * struct starpu_conf:: StarPU runtime configuration
- * starpu_shutdown:: Terminate StarPU
- @end menu
- @node starpu_init
- @subsection @code{starpu_init} -- Initialize StarPU
- @table @asis
- @item @emph{Description}:
- This is StarPU initialization method, which must be called prior to any other
- StarPU call. It is possible to specify StarPU's configuration (e.g. scheduling
- policy, number of cores, ...) by passing a non-null argument. Default
- configuration is used if the passed argument is @code{NULL}.
- @item @emph{Return value}:
- Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
- indicates that no worker was available (so that StarPU was not initialized).
- @item @emph{Prototype}:
- @code{int starpu_init(struct starpu_conf *conf);}
- @end table
- @node struct starpu_conf
- @subsection @code{struct starpu_conf} -- StarPU runtime configuration
- @table @asis
- @item @emph{Description}:
- This structure is passed to the @code{starpu_init} function in order
- to configure StarPU.
- When the default value is used, StarPU automatically selects the number
- of processing units and takes the default scheduling policy. This parameter
- overwrites the equivalent environment variables.
- @item @emph{Fields}:
- @table @asis
- @item @code{sched_policy_name} (default = NULL):
- This is the name of the scheduling policy. This can also be specified with the
- @code{STARPU_SCHED} environment variable.
- @item @code{sched_policy} (default = NULL):
- This is the definition of the scheduling policy. This field is ignored
- if @code{sched_policy_name} is set.
- @item @code{ncpus} (default = -1):
- This is the maximum number of CPU cores that StarPU can use. This can also be
- specified with the @code{STARPU_NCPUS} environment variable.
- @item @code{ncuda} (default = -1):
- This is the maximum number of CUDA devices that StarPU can use. This can also be
- specified with the @code{STARPU_NCUDA} environment variable.
- @item @code{nopencl} (default = -1):
- This is the maximum number of OpenCL devices that StarPU can use. This can also be
- specified with the @code{STARPU_NOPENCL} environment variable.
- @item @code{nspus} (default = -1):
- This is the maximum number of Cell SPUs that StarPU can use. This can also be
- specified with the @code{STARPU_NGORDON} environment variable.
- @item @code{use_explicit_workers_bindid} (default = 0)
- If this flag is set, the @code{workers_bindid} array indicates where the
- different workers are bound, otherwise StarPU automatically selects where to
- bind the different workers unless the @code{STARPU_WORKERS_CPUID} environment
- variable is set. The @code{STARPU_WORKERS_CPUID} environment variable is
- ignored if the @code{use_explicit_workers_bindid} flag is set.
- @item @code{workers_bindid[STARPU_NMAXWORKERS]}
- If the @code{use_explicit_workers_bindid} flag is set, this array indicates
- where to bind the different workers. The i-th entry of the
- @code{workers_bindid} indicates the logical identifier of the processor which
- should execute the i-th worker. Note that the logical ordering of the CPUs is
- either determined by the OS, or provided by the @code{hwloc} library in case it
- is available.
- When this flag is set, the @ref{STARPU_WORKERS_CPUID} environment variable is
- ignored.
-
- @item @code{use_explicit_workers_cuda_gpuid} (default = 0)
- If this flag is set, the CUDA workers will be attached to the CUDA devices
- specified in the @code{workers_cuda_gpuid} array. Otherwise, StarPU assigns the
- CUDA devices in a round-robin fashion.
- When this flag is set, the @ref{STARPU_WORKERS_CUDAID} environment variable is
- ignored.
- @item @code{workers_cuda_gpuid[STARPU_NMAXWORKERS]}
- If the @code{use_explicit_workers_cuda_gpuid} flag is set, this array contains
- the logical identifiers of the CUDA devices (as used by @code{cudaGetDevice}).
- @item @code{use_explicit_workers_opencl_gpuid} (default = 0)
- If this flag is set, the OpenCL workers will be attached to the OpenCL devices
- specified in the @code{workers_opencl_gpuid} array. Otherwise, StarPU assigns the
- OpenCL devices in a round-robin fashion.
- @item @code{workers_opencl_gpuid[STARPU_NMAXWORKERS]}:
- @item @code{calibrate} (default = 0):
- If this flag is set, StarPU will calibrate the performance models when
- executing tasks. If this value is equal to -1, the default value is used. The
- default value is overwritten by the @code{STARPU_CALIBRATE} environment
- variable when it is set.
- @end table
- @end table
- @node starpu_shutdown
- @subsection @code{starpu_shutdown} -- Terminate StarPU
- @table @asis
- @item @emph{Description}:
- This is StarPU termination method. It must be called at the end of the
- application: statistics and other post-mortem debugging information are not
- guaranteed to be available until this method has been called.
- @item @emph{Prototype}:
- @code{void starpu_shutdown(void);}
- @end table
- @node Workers' Properties
- @section Workers' Properties
- @menu
- * starpu_worker_get_count:: Get the number of processing units
- * starpu_cpu_worker_get_count:: Get the number of CPU controlled by StarPU
- * starpu_cuda_worker_get_count:: Get the number of CUDA devices controlled by StarPU
- * starpu_opencl_worker_get_count:: Get the number of OpenCL devices controlled by StarPU
- * starpu_spu_worker_get_count:: Get the number of Cell SPUs controlled by StarPU
- * starpu_worker_get_id:: Get the identifier of the current worker
- * starpu_worker_get_devid:: Get the device identifier of a worker
- * starpu_worker_get_type:: Get the type of processing unit associated to a worker
- * starpu_worker_get_name:: Get the name of a worker
- * starpu_worker_get_memory_node:: Get the memory node of a worker
- @end menu
- @node starpu_worker_get_count
- @subsection @code{starpu_worker_get_count} -- Get the number of processing units
- @table @asis
- @item @emph{Description}:
- This function returns the number of workers (i.e. processing units executing
- StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
- @item @emph{Prototype}:
- @code{unsigned starpu_worker_get_count(void);}
- @end table
- @node starpu_cpu_worker_get_count
- @subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPU controlled by StarPU
- @table @asis
- @item @emph{Description}:
- This function returns the number of CPUs controlled by StarPU. The returned
- value should be at most @code{STARPU_NMAXCPUS}.
- @item @emph{Prototype}:
- @code{unsigned starpu_cpu_worker_get_count(void);}
- @end table
- @node starpu_cuda_worker_get_count
- @subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU
- @table @asis
- @item @emph{Description}:
- This function returns the number of CUDA devices controlled by StarPU. The returned
- value should be at most @code{STARPU_MAXCUDADEVS}.
- @item @emph{Prototype}:
- @code{unsigned starpu_cuda_worker_get_count(void);}
- @end table
- @node starpu_opencl_worker_get_count
- @subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU
- @table @asis
- @item @emph{Description}:
- This function returns the number of OpenCL devices controlled by StarPU. The returned
- value should be at most @code{STARPU_MAXOPENCLDEVS}.
- @item @emph{Prototype}:
- @code{unsigned starpu_opencl_worker_get_count(void);}
- @end table
- @node starpu_spu_worker_get_count
- @subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU
- @table @asis
- @item @emph{Description}:
- This function returns the number of Cell SPUs controlled by StarPU.
- @item @emph{Prototype}:
- @code{unsigned starpu_spu_worker_get_count(void);}
- @end table
- @node starpu_worker_get_id
- @subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker
- @table @asis
- @item @emph{Description}:
- This function returns the identifier of the worker associated to the calling
- thread. The returned value is either -1 if the current context is not a StarPU
- worker (i.e. when called from the application outside a task or a callback), or
- an integer between 0 and @code{starpu_worker_get_count() - 1}.
- @item @emph{Prototype}:
- @code{int starpu_worker_get_id(void);}
- @end table
- @node starpu_worker_get_devid
- @subsection @code{starpu_worker_get_devid} -- Get the device identifier of a worker
- @table @asis
- @item @emph{Description}:
- This function returns the device id of the worker associated to an identifier
- (as returned by the @code{starpu_worker_get_id} function). In the case of a
- CUDA worker, this device identifier is the logical device identifier exposed by
- CUDA (used by the @code{cudaGetDevice} function for instance). The device
- identifier of a CPU worker is the logical identifier of the core on which the
- worker was bound; this identifier is either provided by the OS or by the
- @code{hwloc} library in case it is available.
- @item @emph{Prototype}:
- @code{int starpu_worker_get_devid(int id);}
- @end table
- @node starpu_worker_get_type
- @subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker
- @table @asis
- @item @emph{Description}:
- This function returns the type of worker associated to an identifier (as
- returned by the @code{starpu_worker_get_id} function). The returned value
- indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
- core, @code{STARPU_CUDA_WORKER} for a CUDA device,
- @code{STARPU_OPENCL_WORKER} for a OpenCL device, and
- @code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
- identifier is unspecified.
- @item @emph{Prototype}:
- @code{enum starpu_archtype starpu_worker_get_type(int id);}
- @end table
- @node starpu_worker_get_name
- @subsection @code{starpu_worker_get_name} -- Get the name of a worker
- @table @asis
- @item @emph{Description}:
- StarPU associates a unique human readable string to each processing unit. This
- function copies at most the first @code{maxlen} bytes of the unique string
- associated to a worker identified by its identifier @code{id} into the
- @code{dst} buffer. The caller is responsible for ensuring that the @code{dst}
- is a valid pointer to a buffer of @code{maxlen} bytes at least. Calling this
- function on an invalid identifier results in an unspecified behaviour.
- @item @emph{Prototype}:
- @code{void starpu_worker_get_name(int id, char *dst, size_t maxlen);}
- @end table
- @node starpu_worker_get_memory_node
- @subsection @code{starpu_worker_get_memory_node} -- Get the memory node of a worker
- @table @asis
- @item @emph{Description}:
- This function returns the identifier of the memory node associated to the
- worker identified by @code{workerid}.
- @item @emph{Prototype}:
- @code{unsigned starpu_worker_get_memory_node(unsigned workerid);}
- @end table
- @node Data Library
- @section Data Library
- This section describes the data management facilities provided by StarPU.
- We show how to use existing data interfaces in @ref{Data Interfaces}, but developers can
- design their own data interfaces if required.
- @menu
- * starpu_access_mode:: starpu_access_mode
- * unsigned memory_node:: Memory node
- * starpu_data_handle:: StarPU opaque data handle
- * void *interface:: StarPU data interface
- * starpu_data_register:: Register a piece of data to StarPU
- * starpu_data_unregister:: Unregister a piece of data from StarPU
- * starpu_data_invalidate:: Invalidate all data replicates
- * starpu_data_acquire:: Access registered data from the application
- * starpu_data_acquire_cb:: Access registered data from the application asynchronously
- * starpu_data_release:: Release registered data from the application
- @end menu
- @node starpu_access_mode
- @subsection @code{starpu_access_mode} -- Data access mode
- This datatype describes a data access mode. The different available modes are:
- @table @asis
- @table @asis
- @item @code{STARPU_R} read-only mode.
- @item @code{STARPU_W} write-only mode.
- @item @code{STARPU_RW} read-write mode. This is equivalent to @code{STARPU_R|STARPU_W}.
- @item @code{STARPU_SCRATCH} scratch memory. A temporary buffer is allocated for the task, but StarPU does not enforce data consistency.
- @end table
- @end table
- @node unsigned memory_node
- @subsection @code{unsigned memory_node} -- Memory node
- @table @asis
- @item @emph{Description}:
- Every worker is associated to a memory node which is a logical abstraction of
- the address space from which the processing unit gets its data. For instance,
- the memory node associated to the different CPU workers represents main memory
- (RAM), the memory node associated to a GPU is DRAM embedded on the device.
- Every memory node is identified by a logical index which is accessible from the
- @code{starpu_worker_get_memory_node} function. When registering a piece of data
- to StarPU, the specified memory node indicates where the piece of data
- initially resides (we also call this memory node the home node of a piece of
- data).
- @end table
- @node starpu_data_handle
- @subsection @code{starpu_data_handle} -- StarPU opaque data handle
- @table @asis
- @item @emph{Description}:
- StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
- data. Once a piece of data has been registered to StarPU, it is associated to a
- @code{starpu_data_handle} which keeps track of the state of the piece of data
- over the entire machine, so that we can maintain data consistency and locate
- data replicates for instance.
- @end table
- @node void *interface
- @subsection @code{void *interface} -- StarPU data interface
- @table @asis
- @item @emph{Description}:
- Data management is done at a high-level in StarPU: rather than accessing a mere
- list of contiguous buffers, the tasks may manipulate data that are described by
- a high-level construct which we call data interface.
- An example of data interface is the "vector" interface which describes a
- contiguous data array on a specific memory node. This interface is a simple
- structure containing the number of elements in the array, the size of the
- elements, and the address of the array in the appropriate address space (this
- address may be invalid if there is no valid copy of the array in the memory
- node). More information on the data interfaces provided by StarPU is
- given in @ref{Data Interfaces}.
- When a piece of data managed by StarPU is used by a task, the task
- implementation is given a pointer to an interface describing a valid copy of
- the data that is accessible from the current processing unit.
- @end table
- @node starpu_data_register
- @subsection @code{starpu_data_register} -- Register a piece of data to StarPU
- @table @asis
- @item @emph{Description}:
- Register a piece of data into the handle located at the @code{handleptr}
- address. The @code{interface} buffer contains the initial description of the
- data in the home node. The @code{ops} argument is a pointer to a structure
- describing the different methods used to manipulate this type of interface. See
- @ref{struct starpu_data_interface_ops_t} for more details on this structure.
- If @code{home_node} is not a valid memory node, StarPU will automatically
- allocate the memory described by the interface when the data handle is used for
- the first time in write-only mode. Once such a data handle has been
- automatically allocated, it is possible to access it using any access mode.
- Note that StarPU supplies a set of predefined types of interface (e.g. vector or
- matrix) which can be registered by the means of helper functions (e.g.
- @code{starpu_vector_data_register} or @code{starpu_matrix_data_register}).
- @item @emph{Prototype}:
- @code{void starpu_data_register(starpu_data_handle *handleptr,
- uint32_t home_node,
- void *interface,
- struct starpu_data_interface_ops_t *ops);}
- @end table
- @node starpu_data_unregister
- @subsection @code{starpu_data_unregister} -- Unregister a piece of data from StarPU
- @table @asis
- @item @emph{Description}:
- This function unregisters a data handle from StarPU. If the data was
- automatically allocated by StarPU because the home node was not valid, all
- automatically allocated buffers are freed. Otherwise, a valid copy of the data
- is put back into the home node in the buffer that was initially registered.
- Using a data handle that has been unregistered from StarPU results in an
- undefined behaviour.
- @item @emph{Prototype}:
- @code{void starpu_data_unregister(starpu_data_handle handle);}
- @end table
- @node starpu_data_invalidate
- @subsection @code{starpu_data_invalidate} -- Invalidate all data replicates
- @table @asis
- @item @emph{Description}:
- Destroy all replicates of the data handle. After data invalidation, the first
- access to the handle must be performed in write-only mode. Accessing an
- invalidated data in read-mode results in undefined behaviour.
- @item @emph{Prototype}:
- @code{void starpu_data_invalidate(starpu_data_handle handle);}
- @end table
- @c TODO create a specific sections about user interaction with the DSM ?
- @node starpu_data_acquire
- @subsection @code{starpu_data_acquire} -- Access registered data from the application
- @table @asis
- @item @emph{Description}:
- The application must call this function prior to accessing registered data from
- main memory outside tasks. StarPU ensures that the application will get an
- up-to-date copy of the data in main memory located where the data was
- originally registered, and that all concurrent accesses (e.g. from tasks) will
- be consistent with the access mode specified in the @code{mode} argument.
- @code{starpu_data_release} must be called once the application does not need to
- access the piece of data anymore.
- Note that implicit data dependencies are also enforced by
- @code{starpu_data_acquire} in case they are enabled.
- @code{starpu_data_acquire} is a blocking call, so that it cannot be called from
- tasks or from their callbacks (in that case, @code{starpu_data_acquire} returns
- @code{-EDEADLK}). Upon successful completion, this function returns 0.
- @item @emph{Prototype}:
- @code{int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode);}
- @end table
- @node starpu_data_acquire_cb
- @subsection @code{starpu_data_acquire_cb} -- Access registered data from the application asynchronously
- @table @asis
- @item @emph{Description}:
- @code{starpu_data_acquire_cb} is the asynchronous equivalent of
- @code{starpu_data_acquire}. When the data specified in the first argument is
- available in the appropriate access mode, the callback function is executed.
- The application may access the requested data during the execution of this
- callback. The callback function must call @code{starpu_data_release} once the
- application does not need to access the piece of data anymore.
- Note that implicit data dependencies are also enforced by
- @code{starpu_data_acquire_cb} in case they are enabled.
- Contrary to @code{starpu_data_acquire}, this function is non-blocking and may
- be called from task callbacks. Upon successful completion, this function
- returns 0.
- @item @emph{Prototype}:
- @code{int starpu_data_acquire_cb(starpu_data_handle handle, starpu_access_mode mode, void (*callback)(void *), void *arg);}
- @end table
- @node starpu_data_release
- @subsection @code{starpu_data_release} -- Release registered data from the application
- @table @asis
- @item @emph{Description}:
- This function releases the piece of data acquired by the application either by
- @code{starpu_data_acquire} or by @code{starpu_data_acquire_cb}.
- @item @emph{Prototype}:
- @code{void starpu_data_release(starpu_data_handle handle);}
- @end table
- @node Data Interfaces
- @section Data Interfaces
- @menu
- * Variable Interface::
- * Vector Interface::
- * Matrix Interface::
- * BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)::
- * CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)::
- * Block Interface::
- @end menu
- @node Variable Interface
- @subsection Variable Interface
- @table @asis
- @item @emph{Description}:
- @item @emph{Prototype}:
- @code{void starpu_variable_data_register(starpu_data_handle *handle,
- uint32_t home_node,
- uintptr_t ptr, size_t elemsize);}
- @item @emph{Example}:
- @cartouche
- @smallexample
- float var;
- starpu_data_handle var_handle;
- starpu_variable_data_register(&var_handle, 0, (uintptr_t)&var, sizeof(var));
- @end smallexample
- @end cartouche
- @end table
- @node Vector Interface
- @subsection Vector Interface
- @table @asis
- @item @emph{Description}:
- @item @emph{Prototype}:
- @code{void starpu_vector_data_register(starpu_data_handle *handle, uint32_t home_node,
- uintptr_t ptr, uint32_t nx, size_t elemsize);}
- @item @emph{Example}:
- @cartouche
- @smallexample
- float vector[NX];
- starpu_data_handle vector_handle;
- starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
- sizeof(vector[0]));
- @end smallexample
- @end cartouche
- @end table
- @node Matrix Interface
- @subsection Matrix Interface
- @table @asis
- @item @emph{Description}:
- @item @emph{Prototype}:
- @code{void starpu_matrix_data_register(starpu_data_handle *handle, uint32_t home_node,
- uintptr_t ptr, uint32_t ld, uint32_t nx,
- uint32_t ny, size_t elemsize);}
- @item @emph{Example}:
- @cartouche
- @smallexample
- float *matrix;
- starpu_data_handle matrix_handle;
- matrix = (float*)malloc(width * height * sizeof(float));
- starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix,
- width, width, height, sizeof(float));
- @end smallexample
- @end cartouche
- @end table
- @node BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
- @subsection BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
- @table @asis
- @item @emph{Description}:
- @item @emph{Prototype}:
- @code{void starpu_bcsr_data_register(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
- uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize);}
- @item @emph{Example}:
- @cartouche
- @smallexample
- @end smallexample
- @end cartouche
- @end table
- @node CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)
- @subsection CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)
- @table @asis
- @item @emph{Description}:
- @item @emph{Prototype}:
- @code{void starpu_csr_data_register(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
- uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize);}
- @item @emph{Example}:
- @cartouche
- @smallexample
- @end smallexample
- @end cartouche
- @end table
- @node Block Interface
- @subsection Block Interface
- @table @asis
- @item @emph{Description}:
- @item @emph{Prototype}:
- @code{void starpu_block_data_register(starpu_data_handle *handle, uint32_t home_node,
- uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
- uint32_t ny, uint32_t nz, size_t elemsize);}
- @item @emph{Example}:
- @cartouche
- @smallexample
- float *block;
- starpu_data_handle block_handle;
- block = (float*)malloc(nx*ny*nz*sizeof(float));
- starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
- nx, nx*ny, nx, ny, nz, sizeof(float));
- @end smallexample
- @end cartouche
- @end table
- @node Data Partition
- @section Data Partition
- @menu
- * struct starpu_data_filter:: StarPU filter structure
- * starpu_data_partition:: Partition Data
- * starpu_data_unpartition:: Unpartition Data
- * starpu_data_get_nb_children::
- * starpu_data_get_sub_data::
- * Predefined filter functions::
- @end menu
- @node struct starpu_data_filter
- @subsection @code{struct starpu_data_filter} -- StarPU filter structure
- @table @asis
- @item @emph{Description}:
- The filter structure describes a data partitioning function.
- @item @emph{Fields}:
- @table @asis
- @item @code{filter_func}:
- TODO
- @code{void (*filter_func)(void *father_interface, void* child_interface, struct starpu_data_filter *, unsigned id, unsigned nparts);}
- @item @code{get_nchildren}:
- TODO
- @code{unsigned (*get_nchildren)(struct starpu_data_filter *, starpu_data_handle initial_handle);}
- @item @code{get_child_ops}:
- TODO
- @code{struct starpu_data_interface_ops_t *(*get_child_ops)(struct starpu_data_filter *, unsigned id);}
- @item @code{filter_arg}:
- TODO
- @item @code{nchildren}:
- TODO
- @item @code{filter_arg_ptr}:
- TODO
- @end table
- @end table
- @node starpu_data_partition
- @subsection starpu_data_partition -- Partition Data
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data_filter *f);}
- @end table
- @node starpu_data_unpartition
- @subsection starpu_data_unpartition -- Unpartition data
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{void starpu_data_unpartition(starpu_data_handle root_data, uint32_t gathering_node);}
- @end table
- @node starpu_data_get_nb_children
- @subsection starpu_data_get_nb_children
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Return value}:
- This function returns the number of children.
- @item @emph{Prototype}:
- @code{int starpu_data_get_nb_children(starpu_data_handle handle);}
- @end table
- @c starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i);
- @node starpu_data_get_sub_data
- @subsection starpu_data_get_sub_data
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Return value}:
- TODO
- @item @emph{Prototype}:
- @code{starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_data, unsigned depth, ... );}
- @end table
- @node Predefined filter functions
- @subsection Predefined filter functions
- @menu
- * Partitioning BCSR Data::
- * Partitioning BLAS interface::
- * Partitioning Vector Data::
- * Partitioning Block Data::
- @end menu
- This section gives a list of the predefined partitioning functions.
- Examples on how to use them are shown in @ref{Partitioning Data}.
- @node Partitioning BCSR Data
- @subsubsection Partitioning BCSR Data
- @itemize
- @item
- TODO
- @code{void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
- @item
- TODO
- @code{void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
- @end itemize
- @node Partitioning BLAS interface
- @subsubsection Partitioning BLAS interface
- @itemize
- @item
- TODO
- @code{void starpu_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
- @item
- TODO
- @code{void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
- @end itemize
- @node Partitioning Vector Data
- @subsubsection Partitioning Vector Data
- @itemize
- @item
- TODO
- @code{void starpu_block_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
- @item
- TODO
- @code{void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
- @item
- TODO
- @code{void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
- @end itemize
- @node Partitioning Block Data
- @subsubsection Partitioning Block Data
- @itemize
- @item
- TODO
- @code{void starpu_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
- @end itemize
- @node Codelets and Tasks
- @section Codelets and Tasks
- @menu
- * struct starpu_codelet:: StarPU codelet structure
- * struct starpu_task:: StarPU task structure
- * starpu_task_init:: Initialize a Task
- * starpu_task_create:: Allocate and Initialize a Task
- * starpu_task_deinit:: Release all the resources used by a Task
- * starpu_task_destroy:: Destroy a dynamically allocated Task
- * starpu_task_wait:: Wait for the termination of a Task
- * starpu_task_submit:: Submit a Task
- * starpu_task_wait_for_all:: Wait for the termination of all Tasks
- * starpu_get_current_task:: Return the task currently executed by the worker
- * starpu_display_codelet_stats:: Display statistics
- @end menu
- @node struct starpu_codelet
- @subsection @code{struct starpu_codelet} -- StarPU codelet structure
- @table @asis
- @item @emph{Description}:
- The codelet structure describes a kernel that is possibly implemented on
- various targets.
- @item @emph{Fields}:
- @table @asis
- @item @code{where}:
- Indicates which types of processing units are able to execute the codelet.
- @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
- implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
- indicates that it is only available on Cell SPUs.
- @item @code{cpu_func} (optional):
- Is a function pointer to the CPU implementation of the codelet. Its prototype
- must be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The first
- argument being the array of data managed by the data management library, and
- the second argument is a pointer to the argument passed from the @code{cl_arg}
- field of the @code{starpu_task} structure.
- The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear in
- the @code{where} field, it must be non-null otherwise.
- @item @code{cuda_func} (optional):
- Is a function pointer to the CUDA implementation of the codelet. @emph{This
- must be a host-function written in the CUDA runtime API}. Its prototype must
- be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
- field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}
- field, it must be non-null otherwise.
- @item @code{opencl_func} (optional):
- Is a function pointer to the OpenCL implementation of the codelet. Its
- prototype must be:
- @code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
- This pointer is ignored if @code{STARPU_OPENCL} does not appear in the
- @code{where} field, it must be non-null otherwise.
- @item @code{gordon_func} (optional):
- This is the index of the Cell SPU implementation within the Gordon library.
- See Gordon documentation for more details on how to register a kernel and
- retrieve its index.
- @item @code{nbuffers}:
- Specifies the number of arguments taken by the codelet. These arguments are
- managed by the DSM and are accessed from the @code{void *buffers[]}
- array. The constant argument passed with the @code{cl_arg} field of the
- @code{starpu_task} structure is not counted in this number. This value should
- not be above @code{STARPU_NMAXBUFS}.
- @item @code{model} (optional):
- This is a pointer to the performance model associated to this codelet. This
- optional field is ignored when set to @code{NULL}. TODO
- @end table
- @end table
- @node struct starpu_task
- @subsection @code{struct starpu_task} -- StarPU task structure
- @table @asis
- @item @emph{Description}:
- The @code{starpu_task} structure describes a task that can be offloaded on the various
- processing units managed by StarPU. It instantiates a codelet. It can either be
- allocated dynamically with the @code{starpu_task_create} method, or declared
- statically. In the latter case, the programmer has to zero the
- @code{starpu_task} structure and to fill the different fields properly. The
- indicated default values correspond to the configuration of a task allocated
- with @code{starpu_task_create}.
- @item @emph{Fields}:
- @table @asis
- @item @code{cl}:
- Is a pointer to the corresponding @code{starpu_codelet} data structure. This
- describes where the kernel should be executed, and supplies the appropriate
- implementations. When set to @code{NULL}, no code is executed during the tasks,
- such empty tasks can be useful for synchronization purposes.
- @item @code{buffers}:
- Is an array of @code{starpu_buffer_descr_t} structures. It describes the
- different pieces of data accessed by the task, and how they should be accessed.
- The @code{starpu_buffer_descr_t} structure is composed of two fields, the
- @code{handle} field specifies the handle of the piece of data, and the
- @code{mode} field is the required access mode (eg @code{STARPU_RW}). The number
- of entries in this array must be specified in the @code{nbuffers} field of the
- @code{starpu_codelet} structure, and should not exceed @code{STARPU_NMAXBUFS}.
- If insufficient, this value can be set with the @code{--enable-maxbuffers}
- option when configuring StarPU.
- @item @code{cl_arg} (optional) (default = NULL):
- This pointer is passed to the codelet through the second argument
- of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
- In the specific case of the Cell processor, see the @code{cl_arg_size}
- argument.
- @item @code{cl_arg_size} (optional, Cell specific):
- In the case of the Cell processor, the @code{cl_arg} pointer is not directly
- given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
- the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
- at address @code{cl_arg}. In this case, the argument given to the SPU codelet
- is therefore not the @code{cl_arg} pointer, but the address of the buffer in
- local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
- codelets.
- @item @code{callback_func} (optional) (default = @code{NULL}):
- This is a function pointer of prototype @code{void (*f)(void *)} which
- specifies a possible callback. If this pointer is non-null, the callback
- function is executed @emph{on the host} after the execution of the task. The
- callback is passed the value contained in the @code{callback_arg} field. No
- callback is executed if the field is set to @code{NULL}.
- @item @code{callback_arg} (optional) (default = @code{NULL}):
- This is the pointer passed to the callback function. This field is ignored if
- the @code{callback_func} is set to @code{NULL}.
- @item @code{use_tag} (optional) (default = 0):
- If set, this flag indicates that the task should be associated with the tag
- contained in the @code{tag_id} field. Tags allow the application to synchronize
- with the task and to express task dependencies easily.
- @item @code{tag_id}:
- This field contains the tag associated to the task if the @code{use_tag} field
- was set, it is ignored otherwise.
- @item @code{synchronous}:
- If this flag is set, the @code{starpu_task_submit} function is blocking and
- returns only when the task has been executed (or if no worker is able to
- process the task). Otherwise, @code{starpu_task_submit} returns immediately.
- @item @code{priority} (optional) (default = @code{STARPU_DEFAULT_PRIO}):
- This field indicates a level of priority for the task. This is an integer value
- that must be set between the return values of the
- @code{starpu_sched_get_min_priority} function for the least important tasks,
- and that of the @code{starpu_sched_get_max_priority} for the most important
- tasks (included). The @code{STARPU_MIN_PRIO} and @code{STARPU_MAX_PRIO} macros
- are provided for convenience and respectively return the values of
- @code{starpu_sched_get_min_priority} and @code{starpu_sched_get_max_priority}.
- Default priority is @code{STARPU_DEFAULT_PRIO}, which is always defined as 0 in
- order to allow static task initialization. Scheduling strategies that take
- priorities into account can use this parameter to take better scheduling
- decisions, but the scheduling policy may also ignore it.
- @item @code{execute_on_a_specific_worker} (default = 0):
- If this flag is set, StarPU will bypass the scheduler and directly affect this
- task to the worker specified by the @code{workerid} field.
- @item @code{workerid} (optional):
- If the @code{execute_on_a_specific_worker} field is set, this field indicates
- which is the identifier of the worker that should process this task (as
- returned by @code{starpu_worker_get_id}). This field is ignored if
- @code{execute_on_a_specific_worker} field is set to 0.
- @item @code{detach} (optional) (default = 1):
- If this flag is set, it is not possible to synchronize with the task
- by the means of @code{starpu_task_wait} later on. Internal data structures
- are only guaranteed to be freed once @code{starpu_task_wait} is called if the
- flag is not set.
- @item @code{destroy} (optional) (default = 1):
- If this flag is set, the task structure will automatically be freed, either
- after the execution of the callback if the task is detached, or during
- @code{starpu_task_wait} otherwise. If this flag is not set, dynamically
- allocated data structures will not be freed until @code{starpu_task_destroy} is
- called explicitly. Setting this flag for a statically allocated task structure
- will result in undefined behaviour.
- @item @code{predicted} (output field):
- Predicted duration of the task. This field is only set if the scheduling
- strategy uses performance models.
- @end table
- @end table
- @node starpu_task_init
- @subsection @code{starpu_task_init} -- Initialize a Task
- @table @asis
- @item @emph{Description}:
- Initialize a task structure with default values. This function is implicitly
- called by @code{starpu_task_create}. By default, tasks initialized with
- @code{starpu_task_init} must be deinitialized explicitly with
- @code{starpu_task_deinit}. Tasks can also be initialized statically, using the
- constant @code{STARPU_TASK_INITIALIZER}.
- @item @emph{Prototype}:
- @code{void starpu_task_init(struct starpu_task *task);}
- @end table
- @node starpu_task_create
- @subsection @code{starpu_task_create} -- Allocate and Initialize a Task
- @table @asis
- @item @emph{Description}:
- Allocate a task structure and initialize it with default values. Tasks
- allocated dynamically with @code{starpu_task_create} are automatically freed when the
- task is terminated. If the destroy flag is explicitly unset, the resources used
- by the task are freed by calling
- @code{starpu_task_destroy}.
- @item @emph{Prototype}:
- @code{struct starpu_task *starpu_task_create(void);}
- @end table
- @node starpu_task_deinit
- @subsection @code{starpu_task_deinit} -- Release all the resources used by a Task
- @table @asis
- @item @emph{Description}:
- Release all the structures automatically allocated to execute the task. This is
- called automatically by @code{starpu_task_destroy}, but the task structure itself is not
- freed. This should be used for statically allocated tasks for instance.
- @item @emph{Prototype}:
- @code{void starpu_task_deinit(struct starpu_task *task);}
- @end table
- @node starpu_task_destroy
- @subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task
- @table @asis
- @item @emph{Description}:
- Free the resource allocated during @code{starpu_task_create}. This function can be
- called automatically after the execution of a task by setting the
- @code{destroy} flag of the @code{starpu_task} structure (default behaviour).
- Calling this function on a statically allocated task results in an undefined
- behaviour.
- @item @emph{Prototype}:
- @code{void starpu_task_destroy(struct starpu_task *task);}
- @end table
- @node starpu_task_wait
- @subsection @code{starpu_task_wait} -- Wait for the termination of a Task
- @table @asis
- @item @emph{Description}:
- This function blocks until the task has been executed. It is not possible to
- synchronize with a task more than once. It is not possible to wait for
- synchronous or detached tasks.
- @item @emph{Return value}:
- Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
- indicates that the specified task was either synchronous or detached.
- @item @emph{Prototype}:
- @code{int starpu_task_wait(struct starpu_task *task);}
- @end table
- @node starpu_task_submit
- @subsection @code{starpu_task_submit} -- Submit a Task
- @table @asis
- @item @emph{Description}:
- This function submits a task to StarPU. Calling this function does
- not mean that the task will be executed immediately as there can be data or task
- (tag) dependencies that are not fulfilled yet: StarPU will take care of
- scheduling this task with respect to such dependencies.
- This function returns immediately if the @code{synchronous} field of the
- @code{starpu_task} structure was set to 0, and blocks until the termination of
- the task otherwise. It is also possible to synchronize the application with
- asynchronous tasks by the means of tags, using the @code{starpu_tag_wait}
- function for instance.
- @item @emph{Return value}:
- In case of success, this function returns 0, a return value of @code{-ENODEV}
- means that there is no worker able to process this task (e.g. there is no GPU
- available and this task is only implemented for CUDA devices).
- @item @emph{Prototype}:
- @code{int starpu_task_submit(struct starpu_task *task);}
- @end table
- @node starpu_task_wait_for_all
- @subsection @code{starpu_task_wait_for_all} -- Wait for the termination of all Tasks
- @table @asis
- @item @emph{Description}:
- This function blocks until all the tasks that were submitted are terminated.
- @item @emph{Prototype}:
- @code{void starpu_task_wait_for_all(void);}
- @end table
- @node starpu_get_current_task
- @subsection @code{starpu_get_current_task} -- Return the task currently executed by the worker
- @table @asis
- @item @emph{Description}:
- This function returns the task currently executed by the worker, or
- NULL if it is called either from a thread that is not a task or simply
- because there is no task being executed at the moment.
- @item @emph{Prototype}:
- @code{struct starpu_task *starpu_get_current_task(void);}
- @end table
- @node starpu_display_codelet_stats
- @subsection @code{starpu_display_codelet_stats} -- Display statistics
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{void starpu_display_codelet_stats(struct starpu_codelet_t *cl);}
- @end table
- @c Callbacks : what can we put in callbacks ?
- @node Explicit Dependencies
- @section Explicit Dependencies
- @menu
- * starpu_task_declare_deps_array:: starpu_task_declare_deps_array
- * starpu_tag_t:: Task logical identifier
- * starpu_tag_declare_deps:: Declare the Dependencies of a Tag
- * starpu_tag_declare_deps_array:: Declare the Dependencies of a Tag
- * starpu_tag_wait:: Block until a Tag is terminated
- * starpu_tag_wait_array:: Block until a set of Tags is terminated
- * starpu_tag_remove:: Destroy a Tag
- * starpu_tag_notify_from_apps:: Feed a tag explicitly
- @end menu
- @node starpu_task_declare_deps_array
- @subsection @code{starpu_task_declare_deps_array} -- Declare task dependencies
- @table @asis
- @item @emph{Description}:
- Declare task dependencies between a @code{task} and an array of tasks of length
- @code{ndeps}. This function must be called prior to the submission of the task,
- but it may be called after the submission or the execution of the tasks in the
- array provided the tasks are still valid (i.e. they were not automatically
- destroyed). Calling this function on a task that was already submitted or with
- an entry of @code{task_array} that is not a valid task anymore results in an
- undefined behaviour. If @code{ndeps} is null, no dependency is added. It is
- possible to call @code{starpu_task_declare_deps_array} multiple times on the
- same task, in this case, the dependencies are added. It is possible to have
- redundancy in the task dependencies.
- @item @emph{Prototype}:
- @code{void starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, struct starpu_task *task_array[]);}
- @end table
- @node starpu_tag_t
- @subsection @code{starpu_tag_t} -- Task logical identifier
- @table @asis
- @item @emph{Description}:
- It is possible to associate a task with a unique ``tag'' and to express
- dependencies between tasks by the means of those tags. To do so, fill the
- @code{tag_id} field of the @code{starpu_task} structure with a tag number (can
- be arbitrary) and set the @code{use_tag} field to 1.
- If @code{starpu_tag_declare_deps} is called with this tag number, the task will
- not be started until the tasks which hold the declared dependency tags are
- completed.
- @end table
- @node starpu_tag_declare_deps
- @subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
- @table @asis
- @item @emph{Description}:
- Specify the dependencies of the task identified by tag @code{id}. The first
- argument specifies the tag which is configured, the second argument gives the
- number of tag(s) on which @code{id} depends. The following arguments are the
- tags which have to be terminated to unlock the task.
- This function must be called before the associated task is submitted to StarPU
- with @code{starpu_task_submit}.
- @item @emph{Remark}
- Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
- last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
- typically need to be explicitly cast. Using the
- @code{starpu_tag_declare_deps_array} function avoids this hazard.
- @item @emph{Prototype}:
- @code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
- @item @emph{Example}:
- @cartouche
- @example
- /* Tag 0x1 depends on tags 0x32 and 0x52 */
- starpu_tag_declare_deps((starpu_tag_t)0x1,
- 2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
- @end example
- @end cartouche
- @end table
- @node starpu_tag_declare_deps_array
- @subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
- @table @asis
- @item @emph{Description}:
- This function is similar to @code{starpu_tag_declare_deps}, except that it
- does not take a variable number of arguments but an array of tags of size
- @code{ndeps}.
- @item @emph{Prototype}:
- @code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
- @item @emph{Example}:
- @cartouche
- @example
- /* Tag 0x1 depends on tags 0x32 and 0x52 */
- starpu_tag_t tag_array[2] = @{0x32, 0x52@};
- starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
- @end example
- @end cartouche
- @end table
- @node starpu_tag_wait
- @subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
- @table @asis
- @item @emph{Description}:
- This function blocks until the task associated to tag @code{id} has been
- executed. This is a blocking call which must therefore not be called within
- tasks or callbacks, but only from the application directly. It is possible to
- synchronize with the same tag multiple times, as long as the
- @code{starpu_tag_remove} function is not called. Note that it is still
- possible to synchronize with a tag associated to a task which @code{starpu_task}
- data structure was freed (e.g. if the @code{destroy} flag of the
- @code{starpu_task} was enabled).
- @item @emph{Prototype}:
- @code{void starpu_tag_wait(starpu_tag_t id);}
- @end table
- @node starpu_tag_wait_array
- @subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
- @table @asis
- @item @emph{Description}:
- This function is similar to @code{starpu_tag_wait} except that it blocks until
- @emph{all} the @code{ntags} tags contained in the @code{id} array are
- terminated.
- @item @emph{Prototype}:
- @code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}
- @end table
- @node starpu_tag_remove
- @subsection @code{starpu_tag_remove} -- Destroy a Tag
- @table @asis
- @item @emph{Description}:
- This function releases the resources associated to tag @code{id}. It can be
- called once the corresponding task has been executed and when there is
- no other tags that depend on this tag anymore.
- @item @emph{Prototype}:
- @code{void starpu_tag_remove(starpu_tag_t id);}
- @end table
- @node starpu_tag_notify_from_apps
- @subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly
- @table @asis
- @item @emph{Description}:
- This function explicitly unlocks tag @code{id}. It may be useful in the
- case of applications which execute part of their computation outside StarPU
- tasks (e.g. third-party libraries). It is also provided as a
- convenient tool for the programmer, for instance to entirely construct the task
- DAG before actually giving StarPU the opportunity to execute the tasks.
- @item @emph{Prototype}:
- @code{void starpu_tag_notify_from_apps(starpu_tag_t id);}
- @end table
- @node Implicit Data Dependencies
- @section Implicit Data Dependencies
- @menu
- * starpu_data_set_default_sequential_consistency_flag:: starpu_data_set_default_sequential_consistency_flag
- * starpu_data_get_default_sequential_consistency_flag:: starpu_data_get_default_sequential_consistency_flag
- * starpu_data_set_sequential_consistency_flag:: starpu_data_set_sequential_consistency_flag
- @end menu
- In this section, we describe how StarPU makes it possible to insert implicit
- task dependencies in order to enforce sequential data consistency. When this
- data consistency is enabled on a specific data handle, any data access will
- appear as sequentially consistent from the application. For instance, if the
- application submits two tasks that access the same piece of data in read-only
- mode, and then a third task that access it in write mode, dependencies will be
- added between the two first tasks and the third one. Implicit data dependencies
- are also inserted in the case of data accesses from the application.
- @node starpu_data_set_default_sequential_consistency_flag
- @subsection @code{starpu_data_set_default_sequential_consistency_flag} -- Set default sequential consistency flag
- @table @asis
- @item @emph{Description}:
- Set the default sequential consistency flag. If a non-zero value is passed, a
- sequential data consistency will be enforced for all handles registered after
- this function call, otherwise it is disabled. By default, StarPU enables
- sequential data consistency. It is also possible to select the data consistency
- mode of a specific data handle with the
- @code{starpu_data_set_sequential_consistency_flag} function.
- @item @emph{Prototype}:
- @code{void starpu_data_set_default_sequential_consistency_flag(unsigned flag);}
- @end table
- @node starpu_data_get_default_sequential_consistency_flag
- @subsection @code{starpu_data_get_default_sequential_consistency_flag} -- Get current default sequential consistency flag
- @table @asis
- @item @emph{Description}:
- This function returns the current default sequential consistency flag.
- @item @emph{Prototype}:
- @code{unsigned starpu_data_get_default_sequential_consistency_flag(void);}
- @end table
- @node starpu_data_set_sequential_consistency_flag
- @subsection @code{starpu_data_set_sequential_consistency_flag} -- Set data sequential consistency mode
- @table @asis
- @item @emph{Description}:
- Select the data consistency mode associated to a data handle. The consistency
- mode set using this function has the priority over the default mode which can
- be set with @code{starpu_data_set_default_sequential_consistency_flag}.
- @item @emph{Prototype}:
- @code{void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsigned flag);}
- @end table
- @node Performance Model API
- @section Performance Model API
- @menu
- * starpu_load_history_debug::
- * starpu_perfmodel_debugfilepath::
- * starpu_perfmodel_get_arch_name::
- * starpu_force_bus_sampling::
- @end menu
- @node starpu_load_history_debug
- @subsection @code{starpu_load_history_debug}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);}
- @end table
- @node starpu_perfmodel_debugfilepath
- @subsection @code{starpu_perfmodel_debugfilepath}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model, enum starpu_perf_archtype arch, char *path, size_t maxlen);}
- @end table
- @node starpu_perfmodel_get_arch_name
- @subsection @code{starpu_perfmodel_get_arch_name}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen);}
- @end table
- @node starpu_force_bus_sampling
- @subsection @code{starpu_force_bus_sampling}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{void starpu_force_bus_sampling(void);}
- @end table
- @node Profiling API
- @section Profiling API
- @menu
- * starpu_profiling_status_set:: starpu_profiling_status_set
- * starpu_profiling_status_get:: starpu_profiling_status_get
- * struct starpu_task_profiling_info:: task profiling information
- * struct starpu_worker_profiling_info:: worker profiling information
- * starpu_worker_get_profiling_info:: starpu_worker_get_profiling_info
- * struct starpu_bus_profiling_info:: bus profiling information
- * starpu_bus_get_count::
- * starpu_bus_get_id::
- * starpu_bus_get_src::
- * starpu_bus_get_dst::
- * starpu_timing_timespec_delay_us::
- * starpu_timing_timespec_to_us::
- * starpu_bus_profiling_helper_display_summary::
- @end menu
- @node starpu_profiling_status_set
- @subsection @code{starpu_profiling_status_set} -- Set current profiling status
- @table @asis
- @item @emph{Description}:
- This function sets the profiling status. Profiling is activated by passing
- @code{STARPU_PROFILING_ENABLE} in @code{status}. Passing
- @code{STARPU_PROFILING_DISABLE} disables profiling. Calling this function
- resets all profiling measurements. When profiling is enabled, the
- @code{profiling_info} field of the @code{struct starpu_task} structure points
- to a valid @code{struct starpu_task_profiling_info} structure containing
- information about the execution of the task.
- @item @emph{Return value}:
- Negative return values indicate an error, otherwise the previous status is
- returned.
- @item @emph{Prototype}:
- @code{int starpu_profiling_status_set(int status);}
- @end table
- @node starpu_profiling_status_get
- @subsection @code{starpu_profiling_status_get} -- Get current profiling status
- @table @asis
- @item @emph{Description}:
- Return the current profiling status or a negative value in case there was an error.
- @item @emph{Prototype}:
- @code{int starpu_profiling_status_get(void);}
- @end table
- @node struct starpu_task_profiling_info
- @subsection @code{struct starpu_task_profiling_info} -- Task profiling information
- @table @asis
- @item @emph{Description}:
- This structure contains information about the execution of a task. It is
- accessible from the @code{.profiling_info} field of the @code{starpu_task}
- structure if profiling was enabled.
- @item @emph{Fields}:
- @table @asis
- @item @code{submit_time}:
- Date of task submission (relative to the initialization of StarPU).
- @item @code{start_time}:
- Date of task execution beginning (relative to the initialization of StarPU).
- @item @code{end_time}:
- Date of task execution termination (relative to the initialization of StarPU).
- @item @code{workerid}:
- Identifier of the worker which has executed the task.
- @end table
- @end table
- @node struct starpu_worker_profiling_info
- @subsection @code{struct starpu_worker_profiling_info} -- Worker profiling information
- @table @asis
- @item @emph{Description}:
- This structure contains the profiling information associated to a worker.
- @item @emph{Fields}:
- @table @asis
- @item @code{start_time}:
- Starting date for the reported profiling measurements.
- @item @code{total_time}:
- Duration of the profiling measurement interval.
- @item @code{executing_time}:
- Time spent by the worker to execute tasks during the profiling measurement interval.
- @item @code{sleeping_time}:
- Time spent idling by the worker during the profiling measurement interval.
- @item @code{executed_tasks}:
- Number of tasks executed by the worker during the profiling measurement interval.
- @end table
- @end table
- @node starpu_worker_get_profiling_info
- @subsection @code{starpu_worker_get_profiling_info} -- Get worker profiling info
- @table @asis
- @item @emph{Description}:
- Get the profiling info associated to the worker identified by @code{workerid},
- and reset the profiling measurements. If the @code{worker_info} argument is
- NULL, only reset the counters associated to worker @code{workerid}.
- @item @emph{Return value}:
- Upon successful completion, this function returns 0. Otherwise, a negative
- value is returned.
- @item @emph{Prototype}:
- @code{int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *worker_info);}
- @end table
- @node struct starpu_bus_profiling_info
- @subsection @code{struct starpu_bus_profiling_info} -- Bus profiling information
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Fields}:
- @table @asis
- @item @code{start_time}:
- TODO
- @item @code{total_time}:
- TODO
- @item @code{transferred_bytes}:
- TODO
- @item @code{transfer_count}:
- TODO
- @end table
- @end table
- @node starpu_bus_get_count
- @subsection @code{starpu_bus_get_count}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_bus_get_count(void);}
- @end table
- @node starpu_bus_get_id
- @subsection @code{starpu_bus_get_id}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_bus_get_id(int src, int dst);}
- @end table
- @node starpu_bus_get_src
- @subsection @code{starpu_bus_get_src}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_bus_get_src(int busid);}
- @end table
- @node starpu_bus_get_dst
- @subsection @code{starpu_bus_get_dst}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_bus_get_dst(int busid);}
- @end table
- @node starpu_timing_timespec_delay_us
- @subsection @code{starpu_timing_timespec_delay_us}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end);}
- @end table
- @node starpu_timing_timespec_to_us
- @subsection @code{starpu_timing_timespec_to_us}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{double starpu_timing_timespec_to_us(struct timespec *ts);}
- @end table
- @node starpu_bus_profiling_helper_display_summary
- @subsection @code{starpu_bus_profiling_helper_display_summary}
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{void starpu_bus_profiling_helper_display_summary(void);}
- @end table
- @node CUDA extensions
- @section CUDA extensions
- @c void starpu_data_malloc_pinned_if_possible(float **A, size_t dim);
- @menu
- * starpu_cuda_get_local_stream:: Get current worker's CUDA stream
- * starpu_helper_cublas_init:: Initialize CUBLAS on every CUDA device
- * starpu_helper_cublas_shutdown:: Deinitialize CUBLAS on every CUDA device
- @end menu
- @node starpu_cuda_get_local_stream
- @subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream
- @table @asis
- @item @emph{Description}:
- StarPU provides a stream for every CUDA device controlled by StarPU. This
- function is only provided for convenience so that programmers can easily use
- asynchronous operations within codelets without having to create a stream by
- hand. Note that the application is not forced to use the stream provided by
- @code{starpu_cuda_get_local_stream} and may also create its own streams.
- Synchronizing with @code{cudaThreadSynchronize()} is allowed, but will reduce
- the likelihood of having all transfers overlapped.
- @item @emph{Prototype}:
- @code{cudaStream_t *starpu_cuda_get_local_stream(void);}
- @end table
- @node starpu_helper_cublas_init
- @subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device
- @table @asis
- @item @emph{Description}:
- The CUBLAS library must be initialized prior to any CUBLAS call. Calling
- @code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
- controlled by StarPU. This call blocks until CUBLAS has been properly
- initialized on every device.
- @item @emph{Prototype}:
- @code{void starpu_helper_cublas_init(void);}
- @end table
- @node starpu_helper_cublas_shutdown
- @subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device
- @table @asis
- @item @emph{Description}:
- This function synchronously deinitializes the CUBLAS library on every CUDA device.
- @item @emph{Prototype}:
- @code{void starpu_helper_cublas_shutdown(void);}
- @end table
- @node OpenCL extensions
- @section OpenCL extensions
- @menu
- * Enabling OpenCL:: Enabling OpenCL
- * Compiling OpenCL codelets:: Compiling OpenCL codelets
- * Loading OpenCL codelets:: Loading OpenCL codelets
- @end menu
- @node Enabling OpenCL
- @subsection Enabling OpenCL
- On GPU devices which can run both CUDA and OpenCL, CUDA will be
- enabled by default. To enable OpenCL, you need either to disable CUDA
- when configuring StarPU:
- @example
- % ./configure --disable-cuda
- @end example
- or when running applications:
- @example
- % STARPU_NCUDA=0 ./application
- @end example
- OpenCL will automatically be started on any device not yet used by
- CUDA. On a machine with 4 GPUs, it is therefore possible to
- enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
- so:
- @example
- % STARPU_NCUDA=2 ./application
- @end example
- @node Compiling OpenCL codelets
- @subsection Compiling OpenCL codelets
- Source codes for OpenCL codelets can be stored in a file or in a
- string. StarPU provides functions to build the program executable for
- each available OpenCL device as a @code{cl_program} object. This
- program executable can then be loaded within a specific queue as
- explained in the next section. These are only helpers; applications
- can also fill a @code{starpu_opencl_program} array by hand for more advanced
- uses (e.g. different programs on the different OpenCL devices, for
- relocation purposes for instance).
- @menu
- * starpu_opencl_load_opencl_from_file:: Compiling OpenCL source code
- * starpu_opencl_load_opencl_from_string:: Compiling OpenCL source code
- * starpu_opencl_unload_opencl:: Releasing OpenCL code
- @end menu
- @node starpu_opencl_load_opencl_from_file
- @subsubsection @code{starpu_opencl_load_opencl_from_file} -- Compiling OpenCL source code
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_opencl_load_opencl_from_file(char *source_file_name, struct starpu_opencl_program *opencl_programs);}
- @end table
- @node starpu_opencl_load_opencl_from_string
- @subsubsection @code{starpu_opencl_load_opencl_from_string} -- Compiling OpenCL source code
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_opencl_load_opencl_from_string(char *opencl_program_source, struct starpu_opencl_program *opencl_programs);}
- @end table
- @node starpu_opencl_unload_opencl
- @subsubsection @code{starpu_opencl_unload_opencl} -- Releasing OpenCL code
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);}
- @end table
- @node Loading OpenCL codelets
- @subsection Loading OpenCL codelets
- @menu
- * starpu_opencl_load_kernel:: Loading a kernel
- * starpu_opencl_release_kernel:: Releasing a kernel
- @end menu
- @node starpu_opencl_load_kernel
- @subsubsection @code{starpu_opencl_load_kernel} -- Loading a kernel
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs, char *kernel_name, int devid);}
- @end table
- @node starpu_opencl_release_kernel
- @subsubsection @code{starpu_opencl_release_kernel} -- Releasing a kernel
- @table @asis
- @item @emph{Description}:
- TODO
- @item @emph{Prototype}:
- @code{int starpu_opencl_release_kernel(cl_kernel kernel);}
- @end table
- @node Cell extensions
- @section Cell extensions
- Nothing yet.
- @node Miscellaneous helpers
- @section Miscellaneous helpers
- @menu
- * starpu_execute_on_each_worker:: Execute a function on a subset of workers
- @end menu
- @node starpu_execute_on_each_worker
- @subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
- @table @asis
- @item @emph{Description}:
- When calling this method, the offloaded function specified by the first argument is
- executed by every StarPU worker that may execute the function.
- The second argument is passed to the offloaded function.
- The last argument specifies on which types of processing units the function
- should be executed. Similarly to the @code{where} field of the
- @code{starpu_codelet} structure, it is possible to specify that the function
- should be executed on every CUDA device and every CPU by passing
- @code{STARPU_CPU|STARPU_CUDA}.
- This function blocks until the function has been executed on every appropriate
- processing unit, so it may not be called from a callback function, for
- instance.
- @item @emph{Prototype}:
- @code{void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where);}
- @end table
- @c ---------------------------------------------------------------------
- @c Advanced Topics
- @c ---------------------------------------------------------------------
- @node Advanced Topics
- @chapter Advanced Topics
- @menu
- * Defining a new data interface::
- * Defining a new scheduling policy::
- @end menu
- @node Defining a new data interface
- @section Defining a new data interface
- @menu
- * struct starpu_data_interface_ops_t:: Per-interface methods
- * struct starpu_data_copy_methods:: Per-interface data transfer methods
- * An example of data interface:: An example of data interface
- @end menu
- @c void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memory_node); TODO
- @node struct starpu_data_interface_ops_t
- @subsection @code{struct starpu_data_interface_ops_t} -- Per-interface methods
- @table @asis
- @item @emph{Description}:
- TODO describe all the different fields
- @end table
- @node struct starpu_data_copy_methods
- @subsection @code{struct starpu_data_copy_methods} -- Per-interface data transfer methods
- @table @asis
- @item @emph{Description}:
- TODO describe all the different fields
- @end table
- @node An example of data interface
- @subsection An example of data interface
- @table @asis
- TODO
- @end table
- @node Defining a new scheduling policy
- @section Defining a new scheduling policy
- TODO
- A full example showing how to define a new scheduling policy is available in
- the StarPU sources in the directory @code{examples/scheduler/}.
- @menu
- * struct starpu_sched_policy_s::
- * starpu_worker_set_sched_condition::
- * starpu_sched_set_min_priority:: Set the minimum priority level
- * starpu_sched_set_max_priority:: Set the maximum priority level
- * Source code::
- @end menu
- @node struct starpu_sched_policy_s
- @subsection @code{struct starpu_sched_policy_s} -- Scheduler methods
- @table @asis
- @item @emph{Description}:
- This structure contains all the methods that implement a scheduling policy. An
- application may specify which scheduling strategy to use in the @code{sched_policy}
- field of the @code{starpu_conf} structure passed to the @code{starpu_init}
- function.
- @item @emph{Fields}:
- @table @asis
- @item @code{init_sched}:
- Initialize the scheduling policy.
- @item @code{deinit_sched}:
- Cleanup the scheduling policy.
- @item @code{push_task}:
- Insert a task into the scheduler.
- @item @code{push_prio_task}:
- Insert a priority task into the scheduler.
- @item @code{pop_task}:
- Get a task from the scheduler. The mutex associated to the worker is already
- taken when this method is called.
- @item @code{pop_every_task}:
- Remove all available tasks from the scheduler (tasks are chained by the means
- of the prev and next fields of the starpu_task structure). The mutex associated
- to the worker is already taken when this method is called.
- @item @code{post_exec_hook} (optional):
- This method is called every time a task has been executed.
- @item @code{policy_name}:
- Name of the policy (optional).
- @item @code{policy_description}:
- Description of the policy (optional).
- @end table
- @end table
- @node starpu_worker_set_sched_condition
- @subsection @code{starpu_worker_set_sched_condition} -- Specify the condition variable associated to a worker
- @table @asis
- @item @emph{Description}:
- When there is no available task for a worker, StarPU blocks this worker on a
- condition variable. This function specifies which condition variable (and the
- associated mutex) should be used to block (and to wake up) a worker. Note that
- multiple workers may use the same condition variable. For instance, in the case
- of a scheduling strategy with a single task queue, the same condition variable
- would be used to block and wake up all workers.
- The initialization method of a scheduling strategy (@code{init_sched}) must
- call this function once per worker.
- @item @emph{Prototype}:
- @code{void starpu_worker_set_sched_condition(int workerid, pthread_cond_t *sched_cond, pthread_mutex_t *sched_mutex);}
- @end table
- @node starpu_sched_set_min_priority
- @subsection @code{starpu_sched_set_min_priority}
- @table @asis
- @item @emph{Description}:
- Defines the minimum priority level supported by the scheduling policy. The
- default minimum priority level is the same as the default priority level which
- is 0 by convention. The application may access that value by calling the
- @code{starpu_sched_get_min_priority} function. This function should only be
- called from the initialization method of the scheduling policy, and should not
- be used directly from the application.
- @item @emph{Prototype}:
- @code{void starpu_sched_set_min_priority(int min_prio);}
- @end table
- @node starpu_sched_set_max_priority
- @subsection @code{starpu_sched_set_max_priority}
- @table @asis
- @item @emph{Description}:
- Defines the maximum priority level supported by the scheduling policy. The
- default maximum priority level is 1. The application may access that value by
- calling the @code{starpu_sched_get_max_priority} function. This function should
- only be called from the initialization method of the scheduling policy, and
- should not be used directly from the application.
- @item @emph{Prototype}:
- @code{void starpu_sched_set_max_priority(int max_prio);}
- @end table
- @node Source code
- @subsection Source code
- @cartouche
- @smallexample
- static struct starpu_sched_policy_s dummy_sched_policy = @{
- .init_sched = init_dummy_sched,
- .deinit_sched = deinit_dummy_sched,
- .push_task = push_task_dummy,
- .push_prio_task = NULL,
- .pop_task = pop_task_dummy,
- .post_exec_hook = NULL,
- .pop_every_task = NULL,
- .policy_name = "dummy",
- .policy_description = "dummy scheduling strategy"
- @};
- @end smallexample
- @end cartouche
- @c ---------------------------------------------------------------------
- @c Appendices
- @c ---------------------------------------------------------------------
- @c ---------------------------------------------------------------------
- @c Full source code for the 'Scaling a Vector' example
- @c ---------------------------------------------------------------------
- @node Full source code for the 'Scaling a Vector' example
- @appendix Full source code for the 'Scaling a Vector' example
- @menu
- * Main application::
- * CPU Codelet::
- * CUDA Codelet::
- * OpenCL Codelet::
- @end menu
- @node Main application
- @section Main application
- @smallexample
- @include vector_scal_c.texi
- @end smallexample
- @node CPU Codelet
- @section CPU Codelet
- @smallexample
- @include vector_scal_cpu.texi
- @end smallexample
- @node CUDA Codelet
- @section CUDA Codelet
- @smallexample
- @include vector_scal_cuda.texi
- @end smallexample
- @node OpenCL Codelet
- @section OpenCL Codelet
- @menu
- * Invoking the kernel::
- * Source of the kernel::
- @end menu
- @node Invoking the kernel
- @subsection Invoking the kernel
- @smallexample
- @include vector_scal_opencl.texi
- @end smallexample
- @node Source of the kernel
- @subsection Source of the kernel
- @smallexample
- @include vector_scal_opencl_codelet.texi
- @end smallexample
- @bye
|