
merge examples+gcc-plugin

Andra Hugo 13 years ago
parent
commit
94a438324b
100 changed files with 12412 additions and 8727 deletions
  1. doc/COPYING.GFDL (+451 -0)
  2. doc/Makefile.am (+37 -18)
  3. doc/c-extensions.texi (+0 -160)
  4. doc/chapters/advanced-api.texi (+701 -0)
  5. doc/chapters/advanced-examples.texi (+886 -0)
  6. doc/chapters/basic-api.texi (+2273 -0)
  7. doc/chapters/basic-examples.texi (+989 -0)
  8. doc/chapters/c-extensions.texi (+411 -0)
  9. doc/chapters/configuration.texi (+403 -0)
  10. doc/chapters/fdl-1.3.texi (+508 -0)
  11. doc/chapters/fft-support.texi (+107 -0)
  12. doc/chapters/installing.texi (+130 -0)
  13. doc/chapters/introduction.texi (+186 -0)
  14. doc/chapters/mpi-support.texi (+432 -0)
  15. doc/chapters/perf-feedback.texi (+429 -0)
  16. doc/chapters/perf-optimization.texi (+331 -0)
  17. doc/chapters/scaling-vector-example.texi (+48 -0)
  18. doc/chapters/socl.texi (+25 -0)
  19. doc/chapters/tips-tricks.texi (+79 -0)
  20. doc/chapters/using.texi (+113 -0)
  21. doc/vector_scal_c.texi (+20 -10)
  22. doc/chapters/vector_scal_cpu.texi (+68 -0)
  23. doc/vector_scal_cuda.texi (+9 -0)
  24. doc/vector_scal_opencl.texi (+10 -1)
  25. doc/chapters/vector_scal_opencl_codelet.texi (+16 -0)
  26. doc/starpu.css (+12 -0)
  27. doc/starpu.texi (+135 -4947)
  28. doc/tutorial/Makefile (+29 -15)
  29. doc/tutorial/README (+21 -8)
  30. doc/tutorial/hello_world.c (+23 -10)
  31. doc/tutorial/vector_scal.c (+31 -18)
  32. doc/tutorial/vector_scal_cpu.c (+26 -13)
  33. doc/tutorial/vector_scal_cuda.cu (+21 -8)
  34. doc/tutorial/vector_scal_opencl.c (+22 -9)
  35. doc/tutorial/vector_scal_opencl_kernel.cl (+21 -8)
  36. doc/vector_scal_cpu.texi (+0 -32)
  37. doc/vector_scal_opencl_codelet.texi (+0 -7)
  38. examples/Makefile.am (+118 -23)
  39. examples/audio/Makefile (+16 -0)
  40. examples/audio/starpu_audio_processing.c (+39 -22)
  41. examples/axpy/axpy.c (+23 -17)
  42. examples/basic_examples/block.c (+32 -21)
  43. examples/basic_examples/block_cpu.c (+5 -3)
  44. examples/basic_examples/block_cuda.cu (+5 -3)
  45. examples/basic_examples/block_opencl.c (+18 -11)
  46. examples/basic_examples/block_opencl_kernel.cl (+5 -3)
  47. examples/basic_examples/hello_world.c (+20 -11)
  48. examples/basic_examples/mult.c (+49 -26)
  49. examples/basic_examples/mult_impl.c (+0 -384)
  50. examples/basic_examples/multiformat.c (+329 -0)
  51. examples/basic_examples/multiformat_conversion_codelets.c (+81 -0)
  52. examples/basic_examples/multiformat_conversion_codelets_cuda.cu (+50 -0)
  53. examples/basic_examples/multiformat_conversion_codelets_opencl.c (+99 -0)
  54. examples/starpufft/cudaf_kernels.cu (+14 -4)
  55. examples/basic_examples/multiformat_cuda.cu (+43 -0)
  56. examples/basic_examples/multiformat_opencl.c (+92 -0)
  57. examples/starpufft/starpufft_common.c (+8 -5)
  58. examples/basic_examples/multiformat_types.h (+33 -0)
  59. examples/basic_examples/variable.c (+18 -10)
  60. examples/basic_examples/variable_kernels_opencl.c (+0 -1)
  61. examples/basic_examples/vector_scal.c (+57 -24)
  62. examples/basic_examples/vector_scal_c.c (+18 -13)
  63. examples/basic_examples/vector_scal_cpu.c (+35 -5)
  64. examples/basic_examples/vector_scal_cpu_icc.icc (+1 -0)
  65. examples/basic_examples/vector_scal_opencl.c (+1 -1)
  66. examples/basic_examples/vector_scal_opencl_kernel.cl (+3 -2)
  67. examples/callback/callback.c (+25 -11)
  68. examples/cg/cg.c (+84 -23)
  69. examples/cg/cg.h (+15 -15)
  70. examples/cg/cg_kernels.c (+127 -139)
  71. examples/cholesky/cholesky.h (+45 -26)
  72. examples/cholesky/cholesky_grain_tag.c (+100 -60)
  73. examples/cholesky/cholesky_implicit.c (+79 -46)
  74. examples/cholesky/cholesky_kernels.c (+9 -5)
  75. examples/cholesky/cholesky_models.c (+34 -27)
  76. examples/cholesky/cholesky_tag.c (+96 -57)
  77. examples/cholesky/cholesky_tile_tag.c (+75 -54)
  78. examples/cholesky_2ctxs/cholesky/.dirstamp (+0 -0)
  79. examples/cholesky_2ctxs/cholesky/cholesky.h (+0 -154)
  80. examples/cholesky_2ctxs/cholesky/cholesky_grain_tag.c (+0 -382)
  81. examples/cholesky_2ctxs/cholesky/cholesky_implicit.c (+0 -286)
  82. examples/cholesky_2ctxs/cholesky/cholesky_implicit_all_machine.c (+0 -280)
  83. examples/cholesky_2ctxs/cholesky/cholesky_kernels.c (+0 -230)
  84. examples/cholesky_2ctxs/cholesky/cholesky_models.c (+0 -153)
  85. examples/cholesky_2ctxs/cholesky/cholesky_tag.c (+0 -370)
  86. examples/cholesky_2ctxs/cholesky/cholesky_tile_tag.c (+0 -307)
  87. examples/cholesky_2ctxs/cholesky_2ctxs.c (+0 -231)
  88. examples/common/blas_model.c (+5 -5)
  89. examples/common/blas_model.h (+13 -9)
  90. examples/cpp/incrementer_cpp.cpp (+101 -0)
  91. examples/filters/custom_mf/conversion.cu (+51 -0)
  92. examples/filters/custom_mf/conversion_opencl.c (+102 -0)
  93. examples/starpufft/starpufft.c (+17 -4)
  94. examples/filters/custom_mf/cuda.cu (+45 -0)
  95. examples/filters/custom_mf/custom_conversion_codelets.c (+95 -0)
  96. examples/filters/custom_mf/custom_interface.c (+599 -0)
  97. examples/filters/custom_mf/custom_interface.h (+48 -0)
  98. examples/filters/custom_mf/custom_mf_filter.c (+331 -0)
  99. examples/filters/custom_mf/custom_opencl.c (+101 -0)
  100. examples/starpufft/starpufftf.c (+0 -0)

+ 451 - 0
doc/COPYING.GFDL

@@ -0,0 +1,451 @@
+
+                GNU Free Documentation License
+                 Version 1.3, 3 November 2008
+
+
+ Copyright (C) 2000, 2001, 2002, 2007, 2008 Free Software Foundation, Inc.
+     <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+0. PREAMBLE
+
+The purpose of this License is to make a manual, textbook, or other
+functional and useful document "free" in the sense of freedom: to
+assure everyone the effective freedom to copy and redistribute it,
+with or without modifying it, either commercially or noncommercially.
+Secondarily, this License preserves for the author and publisher a way
+to get credit for their work, while not being considered responsible
+for modifications made by others.
+
+This License is a kind of "copyleft", which means that derivative
+works of the document must themselves be free in the same sense.  It
+complements the GNU General Public License, which is a copyleft
+license designed for free software.
+
+We have designed this License in order to use it for manuals for free
+software, because free software needs free documentation: a free
+program should come with manuals providing the same freedoms that the
+software does.  But this License is not limited to software manuals;
+it can be used for any textual work, regardless of subject matter or
+whether it is published as a printed book.  We recommend this License
+principally for works whose purpose is instruction or reference.
+
+
+1. APPLICABILITY AND DEFINITIONS
+
+This License applies to any manual or other work, in any medium, that
+contains a notice placed by the copyright holder saying it can be
+distributed under the terms of this License.  Such a notice grants a
+world-wide, royalty-free license, unlimited in duration, to use that
+work under the conditions stated herein.  The "Document", below,
+refers to any such manual or work.  Any member of the public is a
+licensee, and is addressed as "you".  You accept the license if you
+copy, modify or distribute the work in a way requiring permission
+under copyright law.
+
+A "Modified Version" of the Document means any work containing the
+Document or a portion of it, either copied verbatim, or with
+modifications and/or translated into another language.
+
+A "Secondary Section" is a named appendix or a front-matter section of
+the Document that deals exclusively with the relationship of the
+publishers or authors of the Document to the Document's overall
+subject (or to related matters) and contains nothing that could fall
+directly within that overall subject.  (Thus, if the Document is in
+part a textbook of mathematics, a Secondary Section may not explain
+any mathematics.)  The relationship could be a matter of historical
+connection with the subject or with related matters, or of legal,
+commercial, philosophical, ethical or political position regarding
+them.
+
+The "Invariant Sections" are certain Secondary Sections whose titles
+are designated, as being those of Invariant Sections, in the notice
+that says that the Document is released under this License.  If a
+section does not fit the above definition of Secondary then it is not
+allowed to be designated as Invariant.  The Document may contain zero
+Invariant Sections.  If the Document does not identify any Invariant
+Sections then there are none.
+
+The "Cover Texts" are certain short passages of text that are listed,
+as Front-Cover Texts or Back-Cover Texts, in the notice that says that
+the Document is released under this License.  A Front-Cover Text may
+be at most 5 words, and a Back-Cover Text may be at most 25 words.
+
+A "Transparent" copy of the Document means a machine-readable copy,
+represented in a format whose specification is available to the
+general public, that is suitable for revising the document
+straightforwardly with generic text editors or (for images composed of
+pixels) generic paint programs or (for drawings) some widely available
+drawing editor, and that is suitable for input to text formatters or
+for automatic translation to a variety of formats suitable for input
+to text formatters.  A copy made in an otherwise Transparent file
+format whose markup, or absence of markup, has been arranged to thwart
+or discourage subsequent modification by readers is not Transparent.
+An image format is not Transparent if used for any substantial amount
+of text.  A copy that is not "Transparent" is called "Opaque".
+
+Examples of suitable formats for Transparent copies include plain
+ASCII without markup, Texinfo input format, LaTeX input format, SGML
+or XML using a publicly available DTD, and standard-conforming simple
+HTML, PostScript or PDF designed for human modification.  Examples of
+transparent image formats include PNG, XCF and JPG.  Opaque formats
+include proprietary formats that can be read and edited only by
+proprietary word processors, SGML or XML for which the DTD and/or
+processing tools are not generally available, and the
+machine-generated HTML, PostScript or PDF produced by some word
+processors for output purposes only.
+
+The "Title Page" means, for a printed book, the title page itself,
+plus such following pages as are needed to hold, legibly, the material
+this License requires to appear in the title page.  For works in
+formats which do not have any title page as such, "Title Page" means
+the text near the most prominent appearance of the work's title,
+preceding the beginning of the body of the text.
+
+The "publisher" means any person or entity that distributes copies of
+the Document to the public.
+
+A section "Entitled XYZ" means a named subunit of the Document whose
+title either is precisely XYZ or contains XYZ in parentheses following
+text that translates XYZ in another language.  (Here XYZ stands for a
+specific section name mentioned below, such as "Acknowledgements",
+"Dedications", "Endorsements", or "History".)  To "Preserve the Title"
+of such a section when you modify the Document means that it remains a
+section "Entitled XYZ" according to this definition.
+
+The Document may include Warranty Disclaimers next to the notice which
+states that this License applies to the Document.  These Warranty
+Disclaimers are considered to be included by reference in this
+License, but only as regards disclaiming warranties: any other
+implication that these Warranty Disclaimers may have is void and has
+no effect on the meaning of this License.
+
+2. VERBATIM COPYING
+
+You may copy and distribute the Document in any medium, either
+commercially or noncommercially, provided that this License, the
+copyright notices, and the license notice saying this License applies
+to the Document are reproduced in all copies, and that you add no
+other conditions whatsoever to those of this License.  You may not use
+technical measures to obstruct or control the reading or further
+copying of the copies you make or distribute.  However, you may accept
+compensation in exchange for copies.  If you distribute a large enough
+number of copies you must also follow the conditions in section 3.
+
+You may also lend copies, under the same conditions stated above, and
+you may publicly display copies.
+
+
+3. COPYING IN QUANTITY
+
+If you publish printed copies (or copies in media that commonly have
+printed covers) of the Document, numbering more than 100, and the
+Document's license notice requires Cover Texts, you must enclose the
+copies in covers that carry, clearly and legibly, all these Cover
+Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on
+the back cover.  Both covers must also clearly and legibly identify
+you as the publisher of these copies.  The front cover must present
+the full title with all words of the title equally prominent and
+visible.  You may add other material on the covers in addition.
+Copying with changes limited to the covers, as long as they preserve
+the title of the Document and satisfy these conditions, can be treated
+as verbatim copying in other respects.
+
+If the required texts for either cover are too voluminous to fit
+legibly, you should put the first ones listed (as many as fit
+reasonably) on the actual cover, and continue the rest onto adjacent
+pages.
+
+If you publish or distribute Opaque copies of the Document numbering
+more than 100, you must either include a machine-readable Transparent
+copy along with each Opaque copy, or state in or with each Opaque copy
+a computer-network location from which the general network-using
+public has access to download using public-standard network protocols
+a complete Transparent copy of the Document, free of added material.
+If you use the latter option, you must take reasonably prudent steps,
+when you begin distribution of Opaque copies in quantity, to ensure
+that this Transparent copy will remain thus accessible at the stated
+location until at least one year after the last time you distribute an
+Opaque copy (directly or through your agents or retailers) of that
+edition to the public.
+
+It is requested, but not required, that you contact the authors of the
+Document well before redistributing any large number of copies, to
+give them a chance to provide you with an updated version of the
+Document.
+
+
+4. MODIFICATIONS
+
+You may copy and distribute a Modified Version of the Document under
+the conditions of sections 2 and 3 above, provided that you release
+the Modified Version under precisely this License, with the Modified
+Version filling the role of the Document, thus licensing distribution
+and modification of the Modified Version to whoever possesses a copy
+of it.  In addition, you must do these things in the Modified Version:
+
+A. Use in the Title Page (and on the covers, if any) a title distinct
+   from that of the Document, and from those of previous versions
+   (which should, if there were any, be listed in the History section
+   of the Document).  You may use the same title as a previous version
+   if the original publisher of that version gives permission.
+B. List on the Title Page, as authors, one or more persons or entities
+   responsible for authorship of the modifications in the Modified
+   Version, together with at least five of the principal authors of the
+   Document (all of its principal authors, if it has fewer than five),
+   unless they release you from this requirement.
+C. State on the Title page the name of the publisher of the
+   Modified Version, as the publisher.
+D. Preserve all the copyright notices of the Document.
+E. Add an appropriate copyright notice for your modifications
+   adjacent to the other copyright notices.
+F. Include, immediately after the copyright notices, a license notice
+   giving the public permission to use the Modified Version under the
+   terms of this License, in the form shown in the Addendum below.
+G. Preserve in that license notice the full lists of Invariant Sections
+   and required Cover Texts given in the Document's license notice.
+H. Include an unaltered copy of this License.
+I. Preserve the section Entitled "History", Preserve its Title, and add
+   to it an item stating at least the title, year, new authors, and
+   publisher of the Modified Version as given on the Title Page.  If
+   there is no section Entitled "History" in the Document, create one
+   stating the title, year, authors, and publisher of the Document as
+   given on its Title Page, then add an item describing the Modified
+   Version as stated in the previous sentence.
+J. Preserve the network location, if any, given in the Document for
+   public access to a Transparent copy of the Document, and likewise
+   the network locations given in the Document for previous versions
+   it was based on.  These may be placed in the "History" section.
+   You may omit a network location for a work that was published at
+   least four years before the Document itself, or if the original
+   publisher of the version it refers to gives permission.
+K. For any section Entitled "Acknowledgements" or "Dedications",
+   Preserve the Title of the section, and preserve in the section all
+   the substance and tone of each of the contributor acknowledgements
+   and/or dedications given therein.
+L. Preserve all the Invariant Sections of the Document,
+   unaltered in their text and in their titles.  Section numbers
+   or the equivalent are not considered part of the section titles.
+M. Delete any section Entitled "Endorsements".  Such a section
+   may not be included in the Modified Version.
+N. Do not retitle any existing section to be Entitled "Endorsements"
+   or to conflict in title with any Invariant Section.
+O. Preserve any Warranty Disclaimers.
+
+If the Modified Version includes new front-matter sections or
+appendices that qualify as Secondary Sections and contain no material
+copied from the Document, you may at your option designate some or all
+of these sections as invariant.  To do this, add their titles to the
+list of Invariant Sections in the Modified Version's license notice.
+These titles must be distinct from any other section titles.
+
+You may add a section Entitled "Endorsements", provided it contains
+nothing but endorsements of your Modified Version by various
+parties--for example, statements of peer review or that the text has
+been approved by an organization as the authoritative definition of a
+standard.
+
+You may add a passage of up to five words as a Front-Cover Text, and a
+passage of up to 25 words as a Back-Cover Text, to the end of the list
+of Cover Texts in the Modified Version.  Only one passage of
+Front-Cover Text and one of Back-Cover Text may be added by (or
+through arrangements made by) any one entity.  If the Document already
+includes a cover text for the same cover, previously added by you or
+by arrangement made by the same entity you are acting on behalf of,
+you may not add another; but you may replace the old one, on explicit
+permission from the previous publisher that added the old one.
+
+The author(s) and publisher(s) of the Document do not by this License
+give permission to use their names for publicity for or to assert or
+imply endorsement of any Modified Version.
+
+
+5. COMBINING DOCUMENTS
+
+You may combine the Document with other documents released under this
+License, under the terms defined in section 4 above for modified
+versions, provided that you include in the combination all of the
+Invariant Sections of all of the original documents, unmodified, and
+list them all as Invariant Sections of your combined work in its
+license notice, and that you preserve all their Warranty Disclaimers.
+
+The combined work need only contain one copy of this License, and
+multiple identical Invariant Sections may be replaced with a single
+copy.  If there are multiple Invariant Sections with the same name but
+different contents, make the title of each such section unique by
+adding at the end of it, in parentheses, the name of the original
+author or publisher of that section if known, or else a unique number.
+Make the same adjustment to the section titles in the list of
+Invariant Sections in the license notice of the combined work.
+
+In the combination, you must combine any sections Entitled "History"
+in the various original documents, forming one section Entitled
+"History"; likewise combine any sections Entitled "Acknowledgements",
+and any sections Entitled "Dedications".  You must delete all sections
+Entitled "Endorsements".
+
+
+6. COLLECTIONS OF DOCUMENTS
+
+You may make a collection consisting of the Document and other
+documents released under this License, and replace the individual
+copies of this License in the various documents with a single copy
+that is included in the collection, provided that you follow the rules
+of this License for verbatim copying of each of the documents in all
+other respects.
+
+You may extract a single document from such a collection, and
+distribute it individually under this License, provided you insert a
+copy of this License into the extracted document, and follow this
+License in all other respects regarding verbatim copying of that
+document.
+
+
+7. AGGREGATION WITH INDEPENDENT WORKS
+
+A compilation of the Document or its derivatives with other separate
+and independent documents or works, in or on a volume of a storage or
+distribution medium, is called an "aggregate" if the copyright
+resulting from the compilation is not used to limit the legal rights
+of the compilation's users beyond what the individual works permit.
+When the Document is included in an aggregate, this License does not
+apply to the other works in the aggregate which are not themselves
+derivative works of the Document.
+
+If the Cover Text requirement of section 3 is applicable to these
+copies of the Document, then if the Document is less than one half of
+the entire aggregate, the Document's Cover Texts may be placed on
+covers that bracket the Document within the aggregate, or the
+electronic equivalent of covers if the Document is in electronic form.
+Otherwise they must appear on printed covers that bracket the whole
+aggregate.
+
+
+8. TRANSLATION
+
+Translation is considered a kind of modification, so you may
+distribute translations of the Document under the terms of section 4.
+Replacing Invariant Sections with translations requires special
+permission from their copyright holders, but you may include
+translations of some or all Invariant Sections in addition to the
+original versions of these Invariant Sections.  You may include a
+translation of this License, and all the license notices in the
+Document, and any Warranty Disclaimers, provided that you also include
+the original English version of this License and the original versions
+of those notices and disclaimers.  In case of a disagreement between
+the translation and the original version of this License or a notice
+or disclaimer, the original version will prevail.
+
+If a section in the Document is Entitled "Acknowledgements",
+"Dedications", or "History", the requirement (section 4) to Preserve
+its Title (section 1) will typically require changing the actual
+title.
+
+
+9. TERMINATION
+
+You may not copy, modify, sublicense, or distribute the Document
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense, or distribute it is void, and
+will automatically terminate your rights under this License.
+
+However, if you cease all violation of this License, then your license
+from a particular copyright holder is reinstated (a) provisionally,
+unless and until the copyright holder explicitly and finally
+terminates your license, and (b) permanently, if the copyright holder
+fails to notify you of the violation by some reasonable means prior to
+60 days after the cessation.
+
+Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, receipt of a copy of some or all of the same material does
+not give you any rights to use it.
+
+
+10. FUTURE REVISIONS OF THIS LICENSE
+
+The Free Software Foundation may publish new, revised versions of the
+GNU Free Documentation License from time to time.  Such new versions
+will be similar in spirit to the present version, but may differ in
+detail to address new problems or concerns.  See
+http://www.gnu.org/copyleft/.
+
+Each version of the License is given a distinguishing version number.
+If the Document specifies that a particular numbered version of this
+License "or any later version" applies to it, you have the option of
+following the terms and conditions either of that specified version or
+of any later version that has been published (not as a draft) by the
+Free Software Foundation.  If the Document does not specify a version
+number of this License, you may choose any version ever published (not
+as a draft) by the Free Software Foundation.  If the Document
+specifies that a proxy can decide which future versions of this
+License can be used, that proxy's public statement of acceptance of a
+version permanently authorizes you to choose that version for the
+Document.
+
+11. RELICENSING
+
+"Massive Multiauthor Collaboration Site" (or "MMC Site") means any
+World Wide Web server that publishes copyrightable works and also
+provides prominent facilities for anybody to edit those works.  A
+public wiki that anybody can edit is an example of such a server.  A
+"Massive Multiauthor Collaboration" (or "MMC") contained in the site
+means any set of copyrightable works thus published on the MMC site.
+
+"CC-BY-SA" means the Creative Commons Attribution-Share Alike 3.0 
+license published by Creative Commons Corporation, a not-for-profit 
+corporation with a principal place of business in San Francisco, 
+California, as well as future copyleft versions of that license 
+published by that same organization.
+
+"Incorporate" means to publish or republish a Document, in whole or in 
+part, as part of another Document.
+
+An MMC is "eligible for relicensing" if it is licensed under this 
+License, and if all works that were first published under this License 
+somewhere other than this MMC, and subsequently incorporated in whole or 
+in part into the MMC, (1) had no cover texts or invariant sections, and 
+(2) were thus incorporated prior to November 1, 2008.
+
+The operator of an MMC Site may republish an MMC contained in the site
+under CC-BY-SA on the same site at any time before August 1, 2009,
+provided the MMC is eligible for relicensing.
+
+
+ADDENDUM: How to use this License for your documents
+
+To use this License in a document you have written, include a copy of
+the License in the document and put the following copyright and
+license notices just after the title page:
+
+    Copyright (c)  YEAR  YOUR NAME.
+    Permission is granted to copy, distribute and/or modify this document
+    under the terms of the GNU Free Documentation License, Version 1.3
+    or any later version published by the Free Software Foundation;
+    with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
+    A copy of the license is included in the section entitled "GNU
+    Free Documentation License".
+
+If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts,
+replace the "with...Texts." line with this:
+
+    with the Invariant Sections being LIST THEIR TITLES, with the
+    Front-Cover Texts being LIST, and with the Back-Cover Texts being LIST.
+
+If you have Invariant Sections without Cover Texts, or some other
+combination of the three, merge those two alternatives to suit the
+situation.
+
+If your document contains nontrivial examples of program code, we
+recommend releasing these examples in parallel under your choice of
+free software license, such as the GNU General Public License,
+to permit their use in free software.

+ 37 - 18
doc/Makefile.am

@@ -1,32 +1,44 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009, 2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
+# Permission is granted to copy, distribute and/or modify this document
+# under the terms of the GNU Free Documentation License, Version 1.3
+# or any later version published by the Free Software Foundation;
+# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
 #
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+# See the GNU Free Documentation License in COPYING.GFDL for more details.
 
 info_TEXINFOS = starpu.texi
 
-starpu_TEXINFOS = c-extensions.texi
+starpu_TEXINFOS = chapters/advanced-api.texi \
+	chapters/configuration.texi \
+	chapters/perf-feedback.texi \
+	chapters/vector_scal_cpu.texi \
+	chapters/advanced-examples.texi \
+	chapters/fdl-1.3.texi \
+	chapters/perf-optimization.texi \
+	chapters/vector_scal_c.texi \
+	chapters/basic-api.texi \
+	chapters/installing.texi \
+	chapters/scaling-vector-example.texi \
+	chapters/vector_scal_cuda.texi \
+	chapters/basic-examples.texi \
+	chapters/introduction.texi \
+	chapters/tips-tricks.texi \
+	chapters/vector_scal_opencl_codelet.texi \
+	chapters/c-extensions.texi \
+	chapters/mpi-support.texi \
+	chapters/fft-support.texi \
+	chapters/using.texi \
+	chapters/vector_scal_opencl.texi \
+	chapters/socl.texi
 
 MAINTAINERCLEANFILES = starpu.pdf
 
 EXTRA_DIST = starpu.pdf \
-	starpu.css \
-	vector_scal_c.texi \
-	vector_scal_cpu.texi \
-	vector_scal_cuda.texi \
-	vector_scal_opencl_codelet.texi \
-	vector_scal_opencl.texi
+	starpu.css
 
 AM_MAKEINFOHTMLFLAGS = --css-include=$(top_srcdir)/doc/starpu.css --no-headers --no-split
 
@@ -45,3 +57,10 @@ uninstall-local:
 #
 #CLEANFILES= \
 #	vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
+
+# Rule to update documentation on web server. Should only be used locally.
+update-web:
+	scp starpu.pdf starpu.html sync:/web/runtime/html/StarPU
+
+showcheck:
+	-cat /dev/null

+ 0 - 160
doc/c-extensions.texi

@@ -1,160 +0,0 @@
-@c This is part of the StarPU Handbook.
-@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
-
-@node C Extensions
-@chapter C Extensions
-
-@cindex C extensions
-@cindex GCC plug-in
-
-When configured with @code{--enable-gcc-extensions}, StarPU builds a
-plug-in for the GNU Compiler Collection (GCC), which defines extensions
-to the C language that make it easier to write StarPU code@footnote{This
-feature is only available for GCC 4.5 and later.}.  Those extensions
-include syntactic sugar for defining tasks and their implementations,
-invoking a task, and manipulating data buffers.
-
-This section does not require detailed knowledge of the StarPU library.
-
-Note: as of StarPU @value{VERSION}, this is still an area under
-development and subject to change.
-
-@menu
-* Defining Tasks::              Defining StarPU tasks
-* Registered Data Buffers::     Manipulating data buffers
-@end menu
-
-@node Defining Tasks
-@section Defining Tasks
-
-@cindex task
-@cindex task implementation
-
-The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:
-
-@enumerate
-@item
-tasks may have several implementations---e.g., one for CPUs, one written
-in OpenCL, one written in CUDA;
-@item
-when a task is invoked, it may run in parallel, and StarPU is free to
-choose any of its implementations.
-@end enumerate
-
-Tasks and their implementations must be @emph{declared}.  These
-declarations are annotated with @dfn{attributes} (@pxref{Attribute
-Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
-(GCC)}): the declaration of a task is a regular C function declaration
-with an additional @code{task} attribute, and task implementations are
-declared with a @code{task_implementation} attribute.
-
-The following function attributes are provided:
-
-@table @code
-
-@item task
-@cindex @code{task} attribute
-Declare the given function as a StarPU task.  Its return type must be
-@code{void}, and it must not be defined---instead, a definition will
-automatically be provided by the compiler.
-
-Under the hood, declaring a task leads to the declaration of the
-corresponding @code{codelet} (@pxref{Codelets and Tasks}).  If one or
-more task implementations are declared in the same compilation unit,
-then the codelet and the function itself are also defined; they inherit
-the scope of the task.
-
-Scalar arguments to the task are passed by value and copied to the
-target device if need be---technically, they are passed as the
-@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).
-
-Pointer arguments are assumed to be registered data buffers---the
-@code{buffers} argument of a task (@pxref{Codelets and Tasks,
-@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
-read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
-buffers are assumed to be used read-write (@code{STARPU_RW}).
-
-@item task_implementation (@var{target}, @var{task})
-@cindex @code{task_implementation} attribute
-Declare the given function as an implementation of @var{task} to run on
-@var{target}.  @var{target} must be a string, currently one of
-@code{"cpu"} or @code{"cuda"}.
-@c FIXME: Update when OpenCL support is ready.
-
-@end table
-
-Here is an example:
-
-@example
-static void matmul (const float *A, const float *B, float *C,
-		    size_t nx, size_t ny, size_t nz)
-  __attribute__ ((task));
-
-static void matmul_cpu (const float *A, const float *B, float *C,
-			size_t nx, size_t ny, size_t nz)
-  __attribute__ ((task_implementation ("cpu", matmul)));
-
-
-static void
-matmul_cpu (const float *A, const float *B, float *C,
-	    size_t nx, size_t ny, size_t nz)
-@{
-  size_t i, j, k;
-
-  for (j = 0; j < ny; j++)
-    for (i = 0; i < nx; i++)
-      @{
-	for (k = 0; k < nz; k++)
-	  C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
-      @}
-@}
-@end example
-
-@noindent
-A @code{matmul} task is defined; it has only one implementation,
-@code{matmul_cpu}, which runs on the CPU.  Variables @var{A} and
-@var{B} are input buffers, whereas @var{C} is considered an input/output
-buffer.  The task can be invoked like a regular C function:
-
-@example
-matmul (&A[i * zdim * bydim + k * bzdim * bydim],
-        &B[k * xdim * bzdim + j * bxdim * bzdim],
-        &C[i * xdim * bydim + j * bxdim * bydim],
-        bxdim, bydim, bzdim);
-@end example
-
-@noindent
-This leads to an @dfn{asynchronous invocation}, whereby @code{matmul}'s
-implementation may run in parallel with the continuation of the caller.
-
-The next section describes how memory buffers must be handled in
-StarPU-GCC code.
-
-
-@node Registered Data Buffers
-@section Registered Data Buffers
-
-Data buffers such as matrices and vectors that are to be passed to tasks
-must be @dfn{registered}.  Registration allows StarPU to handle data
-transfers among devices---e.g., transferring an input buffer from the
-CPU's main memory to a task scheduled to run on a GPU (@pxref{StarPU Data
-Management Library}).
-
-The following pragmas are provided:
-
-@table @code
-
-@item #pragma starpu register @var{ptr} [@var{size}]
-Register @var{ptr} as a @var{size}-element buffer.
-
-@item #pragma starpu unregister @var{ptr}
-@item #pragma starpu acquire @var{ptr}
-
-@end table
-
-FIXME: finish
-
-@c Local Variables:
-@c TeX-master: "guile.texi"
-@c ispell-local-dictionary: "american"
-@c End:

+ 701 - 0
doc/chapters/advanced-api.texi

@@ -0,0 +1,701 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Defining a new data interface::  
+* Multiformat Data Interface::  
+* Task Bundles::                
+* Task Lists::                  
+* Using Parallel Tasks::        
+* Defining a new scheduling policy::  
+* Expert mode::                 
+@end menu
+
+@node Defining a new data interface
+@section Defining a new data interface
+
+@menu
+* Data Interface API::  Data Interface API
+* An example of data interface::        An example of data interface
+@end menu
+
+@node Data Interface API
+@subsection Data Interface API
+
+@deftp {Data Type} {struct starpu_data_interface_ops}
+@anchor{struct starpu_data_interface_ops}
+Per-interface data transfer methods.
+
+@table @asis
+@item @code{void (*register_data_handle)(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)}
+Register an existing interface into a data handle.
+
+@item @code{starpu_ssize_t (*allocate_data_on_node)(void *data_interface, uint32_t node)}
+Allocate data for the interface on a given node.
+
+@item @code{ void (*free_data_on_node)(void *data_interface, uint32_t node)}
+Free data of the interface on a given node.
+
+@item @code{ const struct starpu_data_copy_methods *copy_methods}
+ram/cuda/spu/opencl synchronous and asynchronous transfer methods.
+
+@item @code{ void * (*handle_to_pointer)(starpu_data_handle_t handle, uint32_t node)}
+Return the current pointer (if any) for the handle on the given node.
+
+@item @code{ size_t (*get_size)(starpu_data_handle_t handle)}
+Return an estimation of the size of data, for performance models.
+
+@item @code{ uint32_t (*footprint)(starpu_data_handle_t handle)}
+Return a 32bit footprint which characterizes the data size.
+
+@item @code{ int (*compare)(void *data_interface_a, void *data_interface_b)}
+Compare the data size of two interfaces.
+
+@item @code{ void (*display)(starpu_data_handle_t handle, FILE *f)}
+Dump the sizes of a handle to a file.
+
+@item @code{ int (*convert_to_gordon)(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)}
+Convert the data size to the SPU size format. If no SPUs are used, this field can be set to NULL.
+
+@item @code{enum starpu_data_interface_id interfaceid}
+An identifier that is unique to each interface.
+
+@item @code{size_t interface_size}
+The size of the interface data descriptor.
+
+@end table
+@end deftp
+
+@deftp {Data Type} {struct starpu_data_copy_methods}
+Defines the per-interface methods.
+@table @asis
+@item @code{int @{ram,cuda,opencl,spu@}_to_@{ram,cuda,opencl,spu@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+These 16 functions define how to copy data from the @var{src_interface}
+interface on the @var{src_node} node to the @var{dst_interface} interface
+on the @var{dst_node} node. They return 0 on success.
+
+@item @code{int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
+Define how to copy data from the @var{src_interface} interface on the
+@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
+@var{dst_node} node (on a CUDA device), using the given @var{stream}. Return 0
+on success.
+
+@item @code{int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
+Define how to copy data from the @var{src_interface} interface on the
+@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on the
+@var{dst_node} node (in RAM), using the given @var{stream}. Return 0
+on success.
+
+@item @code{int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
+Define how to copy data from the @var{src_interface} interface on the
+@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on
+the @var{dst_node} node (on another CUDA device), using the given @var{stream}.
+Return 0 on success.
+
+@item @code{int (*ram_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
+Define how to copy data from the @var{src_interface} interface on the
+@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
+@var{dst_node} node (on an OpenCL device), using @var{event}, a pointer to a
+cl_event. Return 0 on success.
+
+@item @code{int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
+Define how to copy data from the @var{src_interface} interface on the
+@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
+on the @var{dst_node} node (in RAM), using the given @var{event}, a pointer to
+a cl_event. Return 0 on success.
+
+@item @code{int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
+Define how to copy data from the @var{src_interface} interface on the
+@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
+on the @var{dst_node} node (on another OpenCL device), using the given
+@var{event}, a pointer to a cl_event. Return 0 on success.
+@end table
+@end deftp
+
+@deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
+Compute the CRC of a byte buffer seeded by the inputcrc "current
+state". The return value should be considered as the new "current
+state" for future CRC computation. This is used for computing data size
+footprint.
+@end deftypefun
+
+@deftypefun uint32_t starpu_crc32_be (uint32_t @var{input}, uint32_t @var{inputcrc})
+Compute the CRC of a 32bit number seeded by the inputcrc "current
+state". The return value should be considered as the new "current
+state" for future CRC computation. This is used for computing data size
+footprint.
+@end deftypefun
+
+@deftypefun uint32_t starpu_crc32_string ({char *}@var{str}, uint32_t @var{inputcrc})
+Compute the CRC of a string seeded by the inputcrc "current state".
+The return value should be considered as the new "current state" for
+future CRC computation. This is used for computing data size footprint.
+@end deftypefun
+
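+As an illustration, these helpers are typically used to implement the
+@code{footprint} method of @code{struct starpu_data_interface_ops}. The sketch
+below is not taken from the StarPU sources and assumes a hypothetical
+@code{my_interface_get_nx} accessor; it simply folds the number of elements of
+the data into a 32-bit footprint.
+
+@cartouche
+@smallexample
+static uint32_t my_interface_footprint(starpu_data_handle_t handle)
+@{
+        /* Characterize the data layout by its number of elements. */
+        return starpu_crc32_be(my_interface_get_nx(handle), 0);
+@}
+@end smallexample
+@end cartouche
+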
+@node An example of data interface
+@subsection An example of data interface
+
+@deftypefun int starpu_data_interface_get_next_id ()
+Returns the next available id for a newly created data interface.
+@end deftypefun
+
+Let's define a new data interface to manage complex numbers.
+
+@cartouche
+@smallexample
+/* interface for complex numbers */
+struct starpu_complex_interface
+@{
+        double *real;
+        double *imaginary;
+        int nx;
+@};
+@end smallexample
+@end cartouche
+
+Registering such data with StarPU is easily done using the function
+@code{starpu_data_register} (@pxref{Basic Data Library API}). The last
+parameter of the function, @code{interface_complex_ops}, will be
+described below.
+
+@cartouche
+@smallexample
+void starpu_complex_data_register(starpu_data_handle_t *handle,
+     uint32_t home_node, double *real, double *imaginary, int nx)
+@{
+        struct starpu_complex_interface complex =
+        @{
+                .real = real,
+                .imaginary = imaginary,
+                .nx = nx
+        @};
+
+        if (interface_complex_ops.interfaceid == -1)
+        @{
+                interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
+        @}
+
+        starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
+@}
+@end smallexample
+@end cartouche
+
+Different operations need to be defined for a data interface through
+the type @code{struct starpu_data_interface_ops} (@pxref{Data
+Interface API}). We only define here the basic operations needed to
+run simple applications. The source code for the different functions
+can be found in the file
+@code{examples/interface/complex_interface.c}.
+
+@cartouche
+@smallexample
+static struct starpu_data_interface_ops interface_complex_ops =
+@{
+        .register_data_handle = complex_register_data_handle,
+        .allocate_data_on_node = complex_allocate_data_on_node,
+        .copy_methods = &complex_copy_methods,
+        .get_size = complex_get_size,
+        .footprint = complex_footprint,
+        .interfaceid = -1,
+        .interface_size = sizeof(struct starpu_complex_interface),
+@};
+@end smallexample
+@end cartouche
+
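+The @code{complex_copy_methods} structure referenced above gathers the
+transfer functions of the interface. As a purely illustrative sketch (the
+complete set of transfer functions belongs to the example in
+@code{examples/interface/}), a RAM-to-RAM copy could be written as follows;
+the field name follows the naming scheme of @code{struct
+starpu_data_copy_methods}, while the function name is arbitrary.
+
+@cartouche
+@smallexample
+static int copy_ram_to_ram(void *src_interface, unsigned src_node,
+                           void *dst_interface, unsigned dst_node)
+@{
+        struct starpu_complex_interface *src = src_interface;
+        struct starpu_complex_interface *dst = dst_interface;
+
+        /* Copy both parts of the complex vector. */
+        memcpy(dst->real, src->real, src->nx * sizeof(double));
+        memcpy(dst->imaginary, src->imaginary, src->nx * sizeof(double));
+        return 0;
+@}
+
+static const struct starpu_data_copy_methods complex_copy_methods =
+@{
+        .ram_to_ram = copy_ram_to_ram,
+@};
+@end smallexample
+@end cartouche
+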
+Functions need to be defined to access the different fields of the
+complex interface from a StarPU data handle.
+
+@cartouche
+@smallexample
+double *starpu_complex_get_real(starpu_data_handle_t handle)
+@{
+        struct starpu_complex_interface *complex_interface =
+          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
+        return complex_interface->real;
+@}
+
+double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
+int starpu_complex_get_nx(starpu_data_handle_t handle);
+@end smallexample
+@end cartouche
+
+Similar functions need to be defined to access the different fields of the
+complex interface from a @code{void *} pointer to be used within codelet
+implementations.
+
+@cartouche
+@smallexample
+#define STARPU_COMPLEX_GET_REAL(interface)	\
+        (((struct starpu_complex_interface *)(interface))->real)
+#define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
+        (((struct starpu_complex_interface *)(interface))->imaginary)
+#define STARPU_COMPLEX_GET_NX(interface)	\
+        (((struct starpu_complex_interface *)(interface))->nx)
+@end smallexample
+@end cartouche
+
+Complex data interfaces can then be registered to StarPU.
+
+@cartouche
+@smallexample
+double real = 45.0;
+double imaginary = 12.0;
+starpu_complex_data_register(&handle1, 0, &real, &imaginary, 1);
+starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
+@end smallexample
+@end cartouche
+
+and used by codelets.
+
+@cartouche
+@smallexample
+void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+@{
+        int nx = STARPU_COMPLEX_GET_NX(descr[0]);
+        double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
+        double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
+        int i;
+
+        for(i=0 ; i<nx ; i++)
+        @{
+                fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
+        @}
+@}
+@end smallexample
+@end cartouche
+
+The whole code for this complex data interface is available in the
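+The @code{cl_display} codelet used above is an ordinary StarPU codelet; a
+minimal sketch of its declaration could look like this (only a CPU
+implementation is provided here):
+
+@cartouche
+@smallexample
+static struct starpu_codelet cl_display =
+@{
+        .cpu_funcs = @{display_complex_codelet, NULL@},
+        .nbuffers = 1,
+        .modes = @{STARPU_R@}
+@};
+@end smallexample
+@end cartouche
+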
+directory @code{examples/interface/}.
+
+@node Multiformat Data Interface
+@section Multiformat Data Interface
+
+@deftp {Data Type} {struct starpu_multiformat_data_interface_ops}
+The different fields are:
+@table @asis
+@item @code{size_t cpu_elemsize}
+the size of each element on CPUs,
+
+@item @code{size_t opencl_elemsize}
+the size of each element on OpenCL devices,
+
+@item @code{struct starpu_codelet *cpu_to_opencl_cl}
+pointer to a codelet which converts from CPU to OpenCL
+
+@item @code{struct starpu_codelet *opencl_to_cpu_cl}
+pointer to a codelet which converts from OpenCL to CPU
+
+@item @code{size_t cuda_elemsize}
+the size of each element on CUDA devices,
+
+@item @code{struct starpu_codelet *cpu_to_cuda_cl}
+pointer to a codelet which converts from CPU to CUDA
+
+@item @code{struct starpu_codelet *cuda_to_cpu_cl}
+pointer to a codelet which converts from CUDA to CPU
+@end table
+@end deftp
+
+@deftypefun void starpu_multiformat_data_register (starpu_data_handle_t *@var{handle}, uint32_t @var{home_node}, void *@var{ptr}, uint32_t @var{nobjects}, struct starpu_multiformat_data_interface_ops *@var{format_ops})
+Register a piece of data that can be represented in different ways, depending upon
+the processing unit that manipulates it. It allows the programmer, for instance, to
+use an array of structures when working on a CPU, and a structure of arrays when
+working on a GPU.
+
+@var{nobjects} is the number of elements in the data. @var{format_ops} describes
+the format.
+@end deftypefun
+
+@defmac STARPU_MULTIFORMAT_GET_CPU_PTR ({void *}@var{interface})
+returns the local pointer to the data with CPU format.
+@end defmac
+
+@defmac STARPU_MULTIFORMAT_GET_CUDA_PTR ({void *}@var{interface})
+returns the local pointer to the data with CUDA format.
+@end defmac
+
+@defmac STARPU_MULTIFORMAT_GET_OPENCL_PTR ({void *}@var{interface})
+returns the local pointer to the data with OpenCL format.
+@end defmac
+
+@defmac STARPU_MULTIFORMAT_GET_NX  ({void *}@var{interface})
+returns the number of elements in the data.
+@end defmac
+
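+As an example, and in the spirit of @code{examples/basic_examples/multiformat.c},
+registering an array of structures that CUDA kernels will see as a structure of
+arrays could look as follows. The structure, codelet and constant names
+(@code{struct point}, @code{struct struct_of_arrays}, @code{cpu_to_cuda_cl},
+@code{cuda_to_cpu_cl}, @code{N_ELEMENTS}) are purely illustrative.
+
+@cartouche
+@smallexample
+static struct starpu_multiformat_data_interface_ops format_ops =
+@{
+        .cpu_elemsize  = sizeof(struct point),
+        .cuda_elemsize = sizeof(struct struct_of_arrays),
+        .cpu_to_cuda_cl = &cpu_to_cuda_cl,
+        .cuda_to_cpu_cl = &cuda_to_cpu_cl,
+@};
+
+struct point array_of_structs[N_ELEMENTS];
+starpu_data_handle_t handle;
+
+starpu_multiformat_data_register(&handle, 0, array_of_structs,
+                                 N_ELEMENTS, &format_ops);
+@end smallexample
+@end cartouche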
+
+@node Task Bundles
+@section Task Bundles
+
+@deftp {Data Type} {starpu_task_bundle_t}
+Opaque structure describing a list of tasks that should be scheduled
+on the same worker whenever possible. It must be considered as a
+hint given to the scheduler as there is no guarantee that they will be
+executed on the same worker.
+@end deftp
+
+@deftypefun void starpu_task_bundle_create ({starpu_task_bundle_t *}@var{bundle})
+Factory function creating and initializing @var{bundle}. When the call returns, the needed memory is allocated and @var{bundle} is ready to use.
+@end deftypefun
+
+@deftypefun int starpu_task_bundle_insert (starpu_task_bundle_t @var{bundle}, {struct starpu_task *}@var{task})
+Insert @var{task} into @var{bundle}. Until @var{task} is removed from @var{bundle}, its expected length and data transfer time will be considered along with those of the other tasks of @var{bundle}.
+This function must not be called if @var{bundle} is already closed and/or @var{task} is already submitted.
+@end deftypefun
+
+@deftypefun int starpu_task_bundle_remove (starpu_task_bundle_t @var{bundle}, {struct starpu_task *}@var{task})
+Remove @var{task} from @var{bundle}.
+Of course, @var{task} must have been previously inserted into @var{bundle}.
+This function must not be called if @var{bundle} is already closed and/or @var{task} is already submitted. Doing so would result in undefined behaviour.
+@end deftypefun
+
+@deftypefun void starpu_task_bundle_close (starpu_task_bundle_t @var{bundle})
+Inform the runtime that the user will not modify @var{bundle} anymore, i.e. no more tasks will be inserted into or removed from it. The runtime can then destroy it when possible.
+@end deftypefun
+
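+One possible usage pattern is sketched below, assuming @code{task1} and
+@code{task2} have already been created (e.g. with @code{starpu_task_create})
+but not yet submitted:
+
+@cartouche
+@smallexample
+starpu_task_bundle_t bundle;
+
+starpu_task_bundle_create(&bundle);
+
+/* Hint the scheduler that both tasks should run on the same worker. */
+starpu_task_bundle_insert(bundle, task1);
+starpu_task_bundle_insert(bundle, task2);
+
+starpu_task_submit(task1);
+starpu_task_submit(task2);
+
+/* No more insertions or removals: the runtime may destroy the bundle
+   once it is not needed anymore. */
+starpu_task_bundle_close(bundle);
+@end smallexample
+@end cartouche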
+
+@node Task Lists
+@section Task Lists
+
+@deftp {Data Type} {struct starpu_task_list}
+Stores a doubly-linked list of tasks.
+@end deftp
+
+@deftypefun void starpu_task_list_init ({struct starpu_task_list *}@var{list})
+Initialize a list structure
+@end deftypefun
+
+@deftypefun void starpu_task_list_push_front ({struct starpu_task_list *}@var{list}, {struct starpu_task *}@var{task})
+Push a task at the front of a list
+@end deftypefun
+
+@deftypefun void starpu_task_list_push_back ({struct starpu_task_list *}@var{list}, {struct starpu_task *}@var{task})
+Push a task at the back of a list
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpu_task_list_front ({struct starpu_task_list *}@var{list})
+Get the front of the list (without removing it)
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpu_task_list_back ({struct starpu_task_list *}@var{list})
+Get the back of the list (without removing it)
+@end deftypefun
+
+@deftypefun int starpu_task_list_empty ({struct starpu_task_list *}@var{list})
+Test if a list is empty
+@end deftypefun
+
+@deftypefun void starpu_task_list_erase ({struct starpu_task_list *}@var{list}, {struct starpu_task *}@var{task})
+Remove an element from the list
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpu_task_list_pop_front ({struct starpu_task_list *}@var{list})
+Remove the element at the front of the list
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpu_task_list_pop_back ({struct starpu_task_list *}@var{list})
+Remove the element at the back of the list
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpu_task_list_begin ({struct starpu_task_list *}@var{list})
+Get the first task of the list.
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpu_task_list_end ({struct starpu_task_list *}@var{list})
+Get the end of the list.
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpu_task_list_next ({struct starpu_task *}@var{task})
+Get the next task of the list. This is not erase-safe.
+@end deftypefun
+
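+These functions are typically used inside a scheduling policy to maintain a
+private queue of tasks. A minimal sketch, assuming @code{task_a} and
+@code{task_b} were created beforehand:
+
+@cartouche
+@smallexample
+struct starpu_task_list list;
+struct starpu_task *task;
+
+starpu_task_list_init(&list);
+starpu_task_list_push_back(&list, task_a);
+starpu_task_list_push_back(&list, task_b);
+
+/* Walk the list without removing its elements. */
+for (task = starpu_task_list_begin(&list);
+     task != starpu_task_list_end(&list);
+     task = starpu_task_list_next(task))
+        fprintf(stderr, "task %p\n", (void *)task);
+
+/* Consume the tasks in FIFO order. */
+while (!starpu_task_list_empty(&list))
+        task = starpu_task_list_pop_front(&list);
+@end smallexample
+@end cartouche
+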
+@node Using Parallel Tasks
+@section Using Parallel Tasks
+
+These are used by parallel tasks:
+
+@deftypefun int starpu_combined_worker_get_size (void)
+Return the size of the current combined worker, i.e. the total number of CPUs
+running the same task in the case of SPMD parallel tasks, or the total number
+of threads that the task is allowed to start in the case of FORKJOIN parallel
+tasks.
+@end deftypefun
+
+@deftypefun int starpu_combined_worker_get_rank (void)
+Return the rank of the current thread within the combined worker. Can only be
+used in SPMD parallel tasks, to know which part of the task to work on.
+@end deftypefun
+
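+To illustrate how @code{starpu_combined_worker_get_size} and
+@code{starpu_combined_worker_get_rank} are used from within a parallel task
+implementation, here is a sketch (not taken from the StarPU sources) of a CPU
+implementation of a vector-scaling task which splits its iteration space
+across the threads of the combined worker; the codelet set-up needed to make
+the task run in parallel is omitted.
+
+@cartouche
+@smallexample
+void scal_cpu_parallel(void *buffers[], void *cl_arg)
+@{
+        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+        float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+        int size = starpu_combined_worker_get_size();
+        int rank = starpu_combined_worker_get_rank();
+
+        /* Each thread of the combined worker handles a contiguous chunk. */
+        unsigned chunk = (n + size - 1) / size;
+        unsigned begin = rank * chunk;
+        unsigned end = STARPU_MIN(begin + chunk, n);
+
+        unsigned i;
+        for (i = begin; i < end; i++)
+                val[i] *= 3.0f;
+@}
+@end smallexample
+@end cartouche
+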
+Most of these are used for schedulers which support parallel tasks.
+
+@deftypefun unsigned starpu_combined_worker_get_count (void)
+Return the number of different combined workers.
+@end deftypefun
+
+@deftypefun int starpu_combined_worker_get_id (void)
+Return the identifier of the current combined worker.
+@end deftypefun
+
+@deftypefun int starpu_combined_worker_assign_workerid (int @var{nworkers}, int @var{workerid_array}[])
+Register a new combined worker and get its identifier
+@end deftypefun
+
+@deftypefun int starpu_combined_worker_get_description (int @var{workerid}, {int *}@var{worker_size}, {int **}@var{combined_workerid})
+Get the description of a combined worker
+@end deftypefun
+
+@deftypefun int starpu_combined_worker_can_execute_task (unsigned @var{workerid}, {struct starpu_task *}@var{task}, unsigned @var{nimpl})
+Variant of starpu_worker_can_execute_task compatible with combined workers
+@end deftypefun
+
+
+@node Defining a new scheduling policy
+@section Defining a new scheduling policy
+
+TODO
+
+A full example showing how to define a new scheduling policy is available in
+the StarPU sources in the directory @code{examples/scheduler/}.
+
+@menu
+* Scheduling Policy API:: Scheduling Policy API
+* Source code::
+@end menu
+
+@node Scheduling Policy API
+@subsection Scheduling Policy API
+
+While StarPU comes with a variety of scheduling policies (@pxref{Task
+scheduling policy}), it may sometimes be desirable to implement custom
+policies to address specific problems.  The API described below allows
+users to write their own scheduling policy.
+
+@deftp {Data Type} {struct starpu_machine_topology}
+@table @asis
+@item @code{unsigned nworkers}
+Total number of workers.
+
+@item @code{unsigned ncombinedworkers}
+Total number of combined workers.
+
+@item @code{hwloc_topology_t hwtopology}
+Topology as detected by hwloc.
+
+To maintain ABI compatibility when hwloc is not available, the field
+is replaced with @code{void *dummy}.
+
+@item @code{unsigned nhwcpus}
+Total number of CPUs, as detected by the topology code. May be different from
+the actual number of CPU workers.
+
+@item @code{unsigned nhwcudagpus}
+Total number of CUDA devices, as detected. May be different from the actual
+number of CUDA workers.
+
+@item @code{unsigned nhwopenclgpus}
+Total number of OpenCL devices, as detected. May be different from the actual
+number of OpenCL workers.
+
+@item @code{unsigned ncpus}
+Actual number of CPU workers used by StarPU.
+
+@item @code{unsigned ncudagpus}
+Actual number of CUDA workers used by StarPU.
+
+@item @code{unsigned nopenclgpus}
+Actual number of OpenCL workers used by StarPU.
+
+@item @code{unsigned ngordon_spus}
+Actual number of Gordon workers used by StarPU.
+
+@item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
+Indicates the successive CPU identifiers that should be used to bind the
+workers. It is either filled according to the user's explicit parameters (from
+@code{starpu_conf}) or according to the @code{STARPU_WORKERS_CPUID} environment
+variable. Otherwise, a round-robin policy is used to distribute the workers
+over the CPUs.
+
+@item @code{unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS]}
+Indicates the successive CUDA device identifiers that should be used by the CUDA
+driver.  It is either filled according to the user's explicit parameters (from
+@code{starpu_conf}) or according to the @code{STARPU_WORKERS_CUDAID} environment
+variable. Otherwise, devices are taken in ID order.
+
+@item @code{unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS]}
+Indicates the successive OpenCL device identifiers that should be used by the OpenCL
+driver.  It is either filled according to the user's explicit parameters (from
+@code{starpu_conf}) or according to the @code{STARPU_WORKERS_OPENCLID} environment
+variable. Otherwise, devices are taken in ID order.
+
+
+@end table
+@end deftp
+
+@deftp {Data Type} {struct starpu_sched_policy}
+This structure contains all the methods that implement a scheduling policy.  An
+application may specify which scheduling strategy to use in the @code{sched_policy}
+field of the @code{starpu_conf} structure passed to the @code{starpu_init}
+function. The different fields are:
+
+@table @asis
+@item @code{void (*init_sched)(struct starpu_machine_topology *, struct starpu_sched_policy *)}
+Initialize the scheduling policy.
+
+@item @code{void (*deinit_sched)(struct starpu_machine_topology *, struct starpu_sched_policy *)}
+Clean up the scheduling policy.
+
+@item @code{int (*push_task)(struct starpu_task *)}
+Insert a task into the scheduler.
+
+@item @code{void (*push_task_notify)(struct starpu_task *, int workerid)}
+Notify the scheduler that a task was pushed on a given worker. This method is
+called when a task that was explicitly assigned to a worker becomes ready and
+is about to be executed by the worker. This method thus makes it possible to
+keep the state of the scheduler coherent even when StarPU bypasses the
+scheduling strategy.
+
+@item @code{struct starpu_task *(*pop_task)(void)} (optional)
+Get a task from the scheduler. The mutex associated to the worker is already
+taken when this method is called. If this method is defined as @code{NULL}, the
+worker will only execute tasks from its local queue. In this case, the
+@code{push_task} method should use the @code{starpu_push_local_task} method to
+assign tasks to the different workers.
+
+@item @code{struct starpu_task *(*pop_every_task)(void)}
+Remove all available tasks from the scheduler (tasks are chained by means
+of the @code{prev} and @code{next} fields of the @code{starpu_task} structure). The mutex
+associated to the worker is already taken when this method is called. This is
+currently only used by the Gordon driver.
+
+@item @code{void (*pre_exec_hook)(struct starpu_task *)} (optional)
+This method is called every time a task is starting.
+
+@item @code{void (*post_exec_hook)(struct starpu_task *)} (optional)
+This method is called every time a task has been executed.
+
+@item @code{const char *policy_name} (optional)
+Name of the policy.
+
+@item @code{const char *policy_description} (optional)
+Description of the policy.
+@end table
+@end deftp
+
+@deftypefun void starpu_worker_set_sched_condition (int @var{workerid}, pthread_cond_t *@var{sched_cond}, pthread_mutex_t *@var{sched_mutex})
+This function specifies the condition variable associated to a worker.
+When there is no available task for a worker, StarPU blocks this worker on a
+condition variable. This function specifies which condition variable (and the
+associated mutex) should be used to block (and to wake up) a worker. Note that
+multiple workers may use the same condition variable. For instance, in the case
+of a scheduling strategy with a single task queue, the same condition variable
+would be used to block and wake up all workers.
+The initialization method of a scheduling strategy (@code{init_sched}) must
+call this function once per worker.
+@end deftypefun
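+
+As a minimal sketch, assuming a strategy with a single task queue where all
+workers share one (static) condition variable and mutex, the initialization
+method could look like this:
+
+@cartouche
+@smallexample
+static pthread_cond_t sched_cond;
+static pthread_mutex_t sched_mutex;
+
+static void init_dummy_sched(struct starpu_machine_topology *topology,
+                             struct starpu_sched_policy *policy)
+@{
+    pthread_cond_init(&sched_cond, NULL);
+    pthread_mutex_init(&sched_mutex, NULL);
+
+    /* All workers are blocked and woken up on the same condition
+     * variable, since there is a single task queue. */
+    unsigned workerid;
+    for (workerid = 0; workerid < topology->nworkers; workerid++)
+        starpu_worker_set_sched_condition(workerid, &sched_cond, &sched_mutex);
+@}
+@end smallexample
+@end cartouche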
+
+@deftypefun void starpu_sched_set_min_priority (int @var{min_prio})
+Defines the minimum priority level supported by the scheduling policy. The
+default minimum priority level is the same as the default priority level which
+is 0 by convention.  The application may access that value by calling the
+@code{starpu_sched_get_min_priority} function. This function should only be
+called from the initialization method of the scheduling policy, and should not
+be used directly from the application.
+@end deftypefun
+
+@deftypefun void starpu_sched_set_max_priority (int @var{max_prio})
+Defines the maximum priority level supported by the scheduling policy. The
+default maximum priority level is 1.  The application may access that value by
+calling the @code{starpu_sched_get_max_priority} function. This function should
+only be called from the initialization method of the scheduling policy, and
+should not be used directly from the application.
+@end deftypefun
+
+@deftypefun int starpu_sched_get_min_priority (void)
+Returns the current minimum priority level supported by the
+scheduling policy.
+@end deftypefun
+
+@deftypefun int starpu_sched_get_max_priority (void)
+Returns the current maximum priority level supported by the
+scheduling policy.
+@end deftypefun
+
+@deftypefun int starpu_push_local_task (int @var{workerid}, {struct starpu_task} *@var{task}, int @var{back})
+The scheduling policy may put tasks directly into a worker's local queue so
+that it is not always necessary to create its own queue when the local queue
+is sufficient. If @var{back} is not null, @var{task} is put at the back of the queue
+where the worker will pop tasks first. Setting @var{back} to 0 therefore ensures
+a FIFO ordering.
+@end deftypefun
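+
+As an illustration, a simplistic (hypothetical) @code{push_task} method could
+dispatch ready tasks to the workers in a round-robin fashion and rely entirely
+on their local queues:
+
+@cartouche
+@smallexample
+static int push_task_roundrobin(struct starpu_task *task)
+@{
+    static unsigned next_worker = 0;
+    int workerid = next_worker++ % starpu_worker_get_count();
+
+    /* A real policy would also check starpu_worker_can_execute_task().
+     * back == 0 appends the task in FIFO order. */
+    return starpu_push_local_task(workerid, task, 0);
+@}
+@end smallexample
+@end cartouche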
+
+@deftypefun int starpu_worker_can_execute_task (unsigned @var{workerid}, {struct starpu_task *}@var{task}, unsigned @var{nimpl})
+Check if the worker specified by @var{workerid} can execute the codelet. Schedulers need to call it before assigning a task to a worker, otherwise the task may fail to execute.
+@end deftypefun
+
+@deftypefun double starpu_timing_now (void)
+Return the current timestamp, in µs.
+@end deftypefun
+
+@deftypefun double starpu_task_expected_length ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+Returns the expected task duration in µs.
+@end deftypefun
+
+@deftypefun double starpu_worker_get_relative_speedup ({enum starpu_perf_archtype} @var{perf_archtype})
+Returns an estimated speedup factor relative to CPU speed.
+@end deftypefun
+
+@deftypefun double starpu_task_expected_data_transfer_time (uint32_t @var{memory_node}, {struct starpu_task *}@var{task})
+Returns the expected data transfer time in µs.
+@end deftypefun
+
+@deftypefun double starpu_data_expected_transfer_time (starpu_data_handle_t @var{handle}, unsigned @var{memory_node}, {enum starpu_access_mode} @var{mode})
+Predict the transfer time (in µs) to move a handle to a memory node.
+@end deftypefun
+
+@deftypefun double starpu_task_expected_power ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+Returns the expected power consumption in J.
+@end deftypefun
+
+@deftypefun double starpu_task_expected_conversion_time ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+Returns the expected conversion time in ms (multiformat interface only).
+@end deftypefun
+
+@node Source code
+@subsection Source code
+
+@cartouche
+@smallexample
+static struct starpu_sched_policy dummy_sched_policy = @{
+    .init_sched = init_dummy_sched,
+    .deinit_sched = deinit_dummy_sched,
+    .push_task = push_task_dummy,
+    .push_prio_task = NULL,
+    .pop_task = pop_task_dummy,
+    .post_exec_hook = NULL,
+    .pop_every_task = NULL,
+    .policy_name = "dummy",
+    .policy_description = "dummy scheduling strategy"
+@};
+@end smallexample
+@end cartouche
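+
+As a rough sketch inspired by @code{examples/scheduler/dummy_sched.c}, the
+corresponding @code{push_task} and @code{pop_task} methods could keep all
+tasks in a single shared list, protected by the condition variable and mutex
+registered in @code{init_sched} (see the sketch given for
+@code{starpu_worker_set_sched_condition} above):
+
+@cartouche
+@smallexample
+/* Initialized with starpu_task_list_init() in init_dummy_sched() */
+static struct starpu_task_list sched_list;
+
+static int push_task_dummy(struct starpu_task *task)
+@{
+    pthread_mutex_lock(&sched_mutex);
+    starpu_task_list_push_front(&sched_list, task);
+    pthread_cond_signal(&sched_cond);
+    pthread_mutex_unlock(&sched_mutex);
+    return 0;
+@}
+
+static struct starpu_task *pop_task_dummy(void)
+@{
+    /* The mutex associated to the calling worker is already held. */
+    return starpu_task_list_pop_back(&sched_list);
+@}
+@end smallexample
+@end cartouche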
+
+@node Expert mode
+@section Expert mode
+
+@deftypefun void starpu_wake_all_blocked_workers (void)
+Wake all the workers, so they can inspect data requests and task submissions
+again.
+@end deftypefun
+
+@deftypefun int starpu_progression_hook_register (unsigned (*@var{func})(void *arg), void *@var{arg})
+Register a progression hook, to be called when workers are idle.
+@end deftypefun
+
+@deftypefun void starpu_progression_hook_deregister (int @var{hook_id})
+Unregister a given progression hook.
+@end deftypefun
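+
+As a hypothetical sketch, a hook that polls an external library whenever
+StarPU workers are idle could look as follows (@code{poll_external_library}
+and @code{some_state} are assumed placeholders, and the return value is
+assumed here to report whether progress was made):
+
+@cartouche
+@smallexample
+static unsigned poll_hook(void *arg)
+@{
+    /* Called when workers are idle: give the external library a chance
+     * to progress, and report whether something happened. */
+    return poll_external_library(arg);
+@}
+
+/* During initialization: */
+int hook_id = starpu_progression_hook_register(poll_hook, some_state);
+
+/* ... */
+
+starpu_progression_hook_deregister(hook_id);
+@end smallexample
+@end cartouche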
+

+ 886 - 0
doc/chapters/advanced-examples.texi

@@ -0,0 +1,886 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Using multiple implementations of a codelet::
+* Enabling implementation according to capabilities::
+* Task and Worker Profiling::   
+* Partitioning Data::           Partitioning Data
+* Performance model example::   
+* Theoretical lower bound on execution time::  
+* Insert Task Utility::          
+* Parallel Tasks::
+* Debugging::
+* The multiformat interface::
+* On-GPU rendering::
+* More examples::               More examples shipped with StarPU
+@end menu
+
+@node Using multiple implementations of a codelet
+@section Using multiple implementations of a codelet
+One may want to write multiple implementations of a codelet for a single type of
+device and let StarPU choose which one to run. As an example, we will show how
+to use SSE to scale a vector. The codelet can be written as follows:
+
+@cartouche
+@smallexample
+#include <xmmintrin.h>
+
+void scal_sse_func(void *buffers[], void *cl_arg)
+@{
+    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+    unsigned int n_iterations = n/4;
+
+    __m128 *VECTOR = (__m128*) vector;
+    __m128 factor __attribute__((aligned(16)));
+    factor = _mm_set1_ps(*(float *) cl_arg);
+
+    unsigned int i;
+    for (i = 0; i < n_iterations; i++)
+        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
+
+    /* Handle the remaining elements which do not fill a whole __m128 */
+    for (i = 4*n_iterations; i < n; i++)
+        vector[i] *= *(float *) cl_arg;
+@}
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+struct starpu_codelet cl = @{
+    .where = STARPU_CPU,
+    .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
+@};
+@end smallexample
+@end cartouche
+
+Schedulers which are multi-implementation aware (only @code{dmda}, @code{heft}
+and @code{pheft} for now) will use the performance models of all the
+implementations they were given, and pick the one which seems to be the fastest.
+
+@node Enabling implementation according to capabilities
+@section Enabling implementation according to capabilities
+
+Some implementations may not run on some devices. For instance, some CUDA
+devices do not support double floating point precision, and thus the kernel
+execution would just fail; or the device may not have enough shared memory for
+the implementation being used. The @code{can_execute} field of the @code{struct
+starpu_codelet} structure permits to express this. For instance:
+
+@cartouche
+@smallexample
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+@{
+  const struct cudaDeviceProp *props;
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    return 1;
+  /* Cuda device */
+  props = starpu_cuda_get_device_properties(workerid);
+  if (props->major > 1 || (props->major == 1 && props->minor >= 3))
+    /* At least compute capability 1.3, supports doubles */
+    return 1;
+  /* Old card, does not support doubles */
+  return 0;
+@}
+
+struct starpu_codelet cl = @{
+    .where = STARPU_CPU|STARPU_CUDA,
+    .can_execute = can_execute,
+    .cpu_funcs = @{ cpu_func, NULL @},
+    .cuda_funcs = @{ gpu_func, NULL @},
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
+@};
+@end smallexample
+@end cartouche
+
+This can be essential e.g. when running on a machine which mixes various models
+of CUDA devices, to take benefit from the new models without crashing on old models.
+
+Note: the @code{can_execute} function is called by the scheduler each time it
+tries to match a task with a worker, and should thus be very fast. The
+@code{starpu_cuda_get_device_properties} function provides quick access to the
+CUDA properties of CUDA devices so that such efficiency can be achieved.
+
+Another example is compiling CUDA code for various compute capabilities,
+resulting in two CUDA functions, e.g. @code{scal_gpu_13} for compute capability
+1.3, and @code{scal_gpu_20} for compute capability 2.0. Both functions can be
+provided to StarPU by using @code{cuda_funcs}, and @code{can_execute} can then be
+used to rule out the @code{scal_gpu_20} variant on a CUDA device which
+will not be able to execute it:
+
+@cartouche
+@smallexample
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+@{
+  const struct cudaDeviceProp *props;
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    return 1;
+  /* Cuda device */
+  if (nimpl == 0)
+    /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases.  */
+    return 1;
+  /* Trying to execute the 2.0 capability variant, check that the card can do it.  */
+  props = starpu_cuda_get_device_properties(workerid);
+  if (props->major >= 2)
+    /* At least compute capability 2.0, can run it */
+    return 1;
+  /* Old card, does not support 2.0, will not be able to execute the 2.0 variant.  */
+  return 0;
+@}
+
+struct starpu_codelet cl = @{
+    .where = STARPU_CPU|STARPU_CUDA,
+    .can_execute = can_execute,
+    .cpu_funcs = @{ cpu_func, NULL @},
+    .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
+@};
+@end smallexample
+@end cartouche
+
+Note: the most generic variant should be provided first, as some schedulers are
+not able to try the different variants.
+
+@node Task and Worker Profiling
+@section Task and Worker Profiling
+
+A full example showing how to use the profiling API is available in
+the StarPU sources in the directory @code{examples/profiling/}.
+
+@cartouche
+@smallexample
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+task->synchronous = 1;
+/* We will destroy the task structure by hand so that we can
+ * query the profiling info before the task is destroyed. */
+task->destroy = 0;
+
+/* Submit and wait for completion (since synchronous was set to 1) */
+starpu_task_submit(task);
+
+/* The task is finished, get profiling information */
+struct starpu_task_profiling_info *info = task->profiling_info;
+
+/* How much time did it take before the task started ? */
+double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
+
+/* How long was the task execution ? */
+double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+
+/* We don't need the task structure anymore */
+starpu_task_destroy(task);
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+/* Display the occupancy of all workers during the test */
+int worker;
+for (worker = 0; worker < starpu_worker_get_count(); worker++)
+@{
+        struct starpu_worker_profiling_info worker_info;
+        int ret = starpu_worker_get_profiling_info(worker, &worker_info);
+        STARPU_ASSERT(!ret);
+
+        double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
+        double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
+        double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
+
+        float executing_ratio = 100.0*executing_time/total_time;
+        float sleeping_ratio = 100.0*sleeping_time/total_time;
+
+        char workername[128];
+        starpu_worker_get_name(worker, workername, 128);
+        fprintf(stderr, "Worker %s:\n", workername);
+        fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
+        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3,
+                executing_ratio);
+        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
+                sleeping_ratio);
+@}
+@end smallexample
+@end cartouche
+
+@node Partitioning Data
+@section Partitioning Data
+
+An existing piece of data can be partitioned into sub-parts to be used by different tasks, for instance:
+
+@cartouche
+@smallexample
+int vector[NX];
+starpu_data_handle_t handle;
+
+/* Declare data to StarPU */
+starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+
+/* Partition the vector in PARTS sub-vectors */
+struct starpu_data_filter f =
+@{
+    .filter_func = starpu_block_filter_func_vector,
+    .nchildren = PARTS
+@};
+starpu_data_partition(handle, &f);
+@end smallexample
+@end cartouche
+
+The task submission then uses @code{starpu_data_get_sub_data} to retrieve the
+sub-handles to be passed as task parameters.
+
+@cartouche
+@smallexample
+/* Submit a task on each sub-vector */
+for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
+    /* Get subdata number i (there is only 1 dimension) */
+    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
+    struct starpu_task *task = starpu_task_create();
+
+    task->handles[0] = sub_handle;
+    task->cl = &cl;
+    task->synchronous = 1;
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+
+    starpu_task_submit(task);
+@}
+@end smallexample
+@end cartouche
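+
+Once the tasks working on the sub-vectors have completed, the handle can be
+gathered back into a single piece with @code{starpu_data_unpartition} before
+being used as a whole again (a minimal sketch):
+
+@cartouche
+@smallexample
+/* Gather the pieces back into the initial handle, in main memory (node 0) */
+starpu_data_unpartition(handle, 0);
+starpu_data_unregister(handle);
+@end smallexample
+@end cartouche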
+
+Partitioning can be applied several times, see
+@code{examples/basic_examples/mult.c} and @code{examples/filters/}.
+
+When the whole piece of data is already available, the partitioning will
+be done in place, i.e. without allocating new buffers but just using pointers
+inside the existing copy. This is particularly important to be aware of when
+using OpenCL, where the kernel parameters are not pointers, but handles. The
+kernel thus also needs to be passed the offset within the OpenCL buffer:
+
+@cartouche
+@smallexample
+void opencl_func(void *buffers[], void *cl_arg)
+@{
+    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
+
+    ...
+    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
+    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
+    ...
+@}
+@end smallexample
+@end cartouche
+
+And the kernel has to shift from the pointer passed by the OpenCL driver:
+
+@cartouche
+@smallexample
+__kernel void opencl_kernel(__global int *vector, unsigned offset)
+@{
+    vector = (__global int *) ((__global char *) vector + offset);
+    ...
+@}
+@end smallexample
+@end cartouche
+
+@node Performance model example
+@section Performance model example
+
+To achieve good scheduling, StarPU scheduling policies need to be able to
+estimate in advance the duration of a task. This is done by giving codelets
+a performance model, by defining a @code{starpu_perfmodel} structure and
+providing its address in the @code{model} field of the @code{struct starpu_codelet}
+structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
+are mandatory, to give a name to the model, and the type of the model, since
+there are several kinds of performance models.
+
+@itemize
+@item
+Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
+given set of data input/output sizes, the performance will always be about the
+same. This holds very well for regular kernels on GPUs for instance (<0.1% error),
+and a bit less well on CPUs (~1% error). This also assumes that there are
+few different sets of data input/output sizes. StarPU will then keep record of
+the average time of previous executions on the various processing units, and use
+it as an estimation. History is kept per task size, by using a hash of the input
+and output sizes as an index.
+StarPU will also save the measurements in @code{~/.starpu/sampling/codelets}
+for further executions; they can be observed by using the
+@code{starpu_perfmodel_display} command, or drawn by using
+the @code{starpu_perfmodel_plot} tool.  The models are indexed by machine name. To
+share the models between machines (e.g. for a homogeneous cluster), use
+@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done when using a task scheduler which makes use of them, such as @code{heft} or @code{dmda}.
+
+If e.g. the code is recompiled with other compilation options, or several
+variants of the code are used, the symbol string should be changed to reflect
+that, in order to recalibrate a new model from zero. The symbol string can even
+be constructed dynamically at execution time, as long as this is done before
+submitting any task using it.
+
+The following is a small code example:
+
+@cartouche
+@smallexample
+static struct starpu_perfmodel mult_perf_model = @{
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "mult_perf_model"
+@};
+
+struct starpu_codelet cl = @{
+    .where = STARPU_CPU,
+    .cpu_funcs = @{ cpu_mult, NULL @},
+    .nbuffers = 3,
+    .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
+    /* for the scheduling policy to be able to use performance models */
+    .model = &mult_perf_model
+@};
+@end smallexample
+@end cartouche
+
+@item
+Measured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}
+model type). This still assumes performance regularity, but can work
+with various data input sizes, by applying regression over observed
+execution times. @code{STARPU_REGRESSION_BASED} uses an a*n^b regression
+form, @code{STARPU_NL_REGRESSION_BASED} uses an a*n^b+c form (more precise than
+@code{STARPU_REGRESSION_BASED}, but costs a lot more to compute). For instance,
+@code{tests/perfmodels/regression_based.c} uses a regression-based performance
+model for the @code{memset} operation. Of course, the application has to issue
+tasks with varying size so that the regression can be computed. StarPU will not
+trust the regression unless there is at least 10% difference between the minimum
+and maximum observed input size. For non-linear regression, since computing it
+is quite expensive, it is only done at termination of the application. This
+means that the first execution uses history-based performance model to perform
+scheduling.
+
+@item
+Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_function} field),
+see for instance
+@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
+
+@item
+Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
+@code{.per_arch[arch][nimpl].cost_function} fields have to be filled with pointers to
+functions which return the expected duration of the task in micro-seconds, one
+per architecture.
+
+@end itemize
+
+For the @code{STARPU_HISTORY_BASED} and @code{STARPU_*REGRESSION_BASED} models,
+the total size of task data (both input and output) is used as an index by
+default. The @code{size_base} field of @code{struct starpu_perfmodel} however
+permits the application to override that, when for instance some of the data
+do not matter for task cost (e.g. mere reference table), or when using sparse
+structures (in which case it is the number of non-zeros which matter), or when
+there is some hidden parameter such as the number of iterations, etc.
+
+How to use schedulers which can benefit from such performance models is explained
+in @ref{Task scheduling policy}.
+
+The same can be done for task power consumption estimation, by setting the
+@code{power_model} field the same way as the @code{model} field. Note: for
+now, the application has to give the power consumption performance model
+a name which is different from that of the execution time performance model.
+
+The application can request time estimations from the StarPU performance
+models by filling a task structure as usual without actually submitting
+it. The data handles can be created by calling @code{starpu_data_register}
+functions with a @code{NULL} pointer (and need to be unregistered as usual)
+and the desired data sizes. The @code{starpu_task_expected_length} and
+@code{starpu_task_expected_power} functions can then be called to get an
+estimation of the task duration on a given arch. @code{starpu_task_destroy}
+needs to be called to destroy the dummy task afterwards. See
+@code{tests/perfmodels/regression_based.c} for an example.
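+
+A rough sketch of such an estimation request, assuming a codelet @code{cl}
+which takes a single vector buffer and already has a calibrated model, and
+with @code{N} standing for the data size to evaluate, could be:
+
+@cartouche
+@smallexample
+starpu_data_handle_t handle;
+/* Register a dummy vector: no actual memory is attached (NULL pointer). */
+starpu_vector_data_register(&handle, 0, (uintptr_t) NULL, N, sizeof(float));
+
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+task->handles[0] = handle;
+
+/* Query the performance model without submitting the task. */
+double length = starpu_task_expected_length(task, STARPU_CUDA_DEFAULT, 0);
+
+starpu_task_destroy(task);
+starpu_data_unregister(handle);
+@end smallexample
+@end cartouche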
+
+@node Theoretical lower bound on execution time
+@section Theoretical lower bound on execution time
+
+For kernels with history-based performance models, StarPU can very easily provide a theoretical lower
+bound for the execution time of a whole set of tasks. See for
+instance @code{examples/lu/lu_example.c}: before submitting tasks,
+call @code{starpu_bound_start}, and after complete execution, call
+@code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
+@code{starpu_bound_print_mps} can then be used to output a Linear Programming
+problem corresponding to the schedule of your tasks. Run it through
+@code{lp_solve} or any other linear programming solver, and that will give you a
+lower bound for the total execution time of your tasks. If StarPU was compiled
+with the glpk library installed, @code{starpu_bound_compute} can be used to
+solve it immediately and get the optimized minimum, in ms. Its @code{integer}
+parameter allows deciding whether integer resolution should be computed
+and returned too.
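+
+A minimal sketch of this workflow (here without taking dependencies or
+priorities into account; see the API documentation for the exact prototypes)
+could be:
+
+@cartouche
+@smallexample
+starpu_bound_start(0 /* deps */, 0 /* prio */);
+
+/* ... submit tasks as usual ... */
+starpu_task_wait_for_all();
+
+starpu_bound_stop();
+
+/* Either solve directly (requires glpk support)... */
+double min, integer_min;
+starpu_bound_compute(&min, &integer_min, 0);
+fprintf(stderr, "theoretical lower bound: %lf ms\n", min);
+
+/* ... or export the problem for an external solver. */
+FILE *f = fopen("test.lp", "w");
+starpu_bound_print_lp(f);
+fclose(f);
+@end smallexample
+@end cartouche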
+
+The @code{deps} parameter tells StarPU whether to take tasks and implicit data
+dependencies into account. It must be understood that the linear programming
+problem size is quadratic with the number of tasks and thus the time to solve it
+will be very long; it could be minutes for just a few dozen tasks. You should
+probably use @code{lp_solve -timeout 1 test.lp -wmps test.mps} to convert the
+problem to MPS format and then use a better solver, @code{glpsol} might be
+better than @code{lp_solve} for instance (the @code{--pcost} option may be
+useful), but sometimes doesn't manage to converge. @code{cbc} might look
+slower, but it is parallel. Be sure to try at least all the @code{-B} options
+of @code{lp_solve}. For instance, we often just use
+@code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
+the @code{-gr} option can also be quite useful.
+
+Setting @code{deps} to 0 will only take into account the actual computations
+on processing units. It however still properly takes into account the varying
+performances of kernels and processing units, which is quite more accurate than
+just comparing StarPU performances with the fastest of the kernels being used.
+
+The @code{prio} parameter tells StarPU whether to simulate taking into account
+the priorities as the StarPU scheduler would, i.e. schedule prioritized
+tasks before less prioritized tasks, to check to what extent this results
+in a less optimal solution. This increases the computation time even more.
+
+Note that for simplicity, all this does not take data transfers into account;
+they are assumed to be completely overlapped.
+
+@node Insert Task Utility
+@section Insert Task Utility
+
+StarPU provides the wrapper function @code{starpu_insert_task} to ease
+the creation and submission of tasks.
+
+@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
+Create and submit a task corresponding to @var{cl} with the following
+arguments.  The argument list must be zero-terminated.
+
+The arguments following the codelet can be of the following types:
+
+@itemize
+@item
+@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX}: an access mode followed by a data handle;
+@item
+the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
+@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
+@code{STARPU_PRIORITY}, followed by the appropriate objects as
+defined below.
+@end itemize
+
+Parameters to be passed to the codelet implementation are defined
+through the type @code{STARPU_VALUE}. The function
+@code{starpu_codelet_unpack_args} must be called within the codelet
+implementation to retrieve them.
+@end deftypefun
+
+@defmac STARPU_VALUE
+This macro is used when calling @code{starpu_insert_task}, and must be
+followed by a pointer to a constant value and the size of the constant.
+@end defmac
+
+@defmac STARPU_CALLBACK
+This macro is used when calling @code{starpu_insert_task}, and must be
+followed by a pointer to a callback function.
+@end defmac
+
+@defmac STARPU_CALLBACK_ARG
+This macro is used when calling @code{starpu_insert_task}, and must be
+followed by a pointer to be given as an argument to the callback
+function.
+@end defmac
+
+@defmac STARPU_CALLBACK_WITH_ARG
+This macro is used when calling @code{starpu_insert_task}, and must be
+followed by two pointers: one to a callback function, and the other to
+be given as an argument to the callback function; this is equivalent
+to using both @code{STARPU_CALLBACK} and
+@code{STARPU_CALLBACK_ARG}.
+@end defmac
+
+@defmac STARPU_PRIORITY
+This macro is used when calling @code{starpu_insert_task}, and must be
+followed by an integer defining a priority level.
+@end defmac
+
+@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
+Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
+given to a codelet and later unpacked with the function
+@code{starpu_codelet_unpack_args} defined below.
+@end deftypefun
+
+@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
+Retrieve the arguments of type @code{STARPU_VALUE} associated to a
+task automatically created using the function
+@code{starpu_insert_task} defined above.
+@end deftypefun
+
+Here is the implementation of the codelet:
+
+@smallexample
+void func_cpu(void *descr[], void *_args)
+@{
+        int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        int ifactor;
+        float ffactor;
+
+        starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
+        *x0 = *x0 * ifactor;
+        *x1 = *x1 * ffactor;
+@}
+
+struct starpu_codelet mycodelet = @{
+        .where = STARPU_CPU,
+        .cpu_funcs = @{ func_cpu, NULL @},
+        .nbuffers = 2,
+        .modes = @{ STARPU_RW, STARPU_RW @}
+@};
+@end smallexample
+
+And the call to the @code{starpu_insert_task} wrapper:
+
+@smallexample
+starpu_insert_task(&mycodelet,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+                   0);
+@end smallexample
+
+The call to @code{starpu_insert_task} is equivalent to the following
+code:
+
+@smallexample
+struct starpu_task *task = starpu_task_create();
+task->cl = &mycodelet;
+task->handles[0] = data_handles[0];
+task->handles[1] = data_handles[1];
+char *arg_buffer;
+size_t arg_buffer_size;
+starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
+                    STARPU_VALUE, &ifactor, sizeof(ifactor),
+                    STARPU_VALUE, &ffactor, sizeof(ffactor),
+                    0);
+task->cl_arg = arg_buffer;
+task->cl_arg_size = arg_buffer_size;
+int ret = starpu_task_submit(task);
+@end smallexample
+
+If some part of the task insertion depends on the value of some computation,
+the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
+instance, assuming that the index variable @code{i} was registered as handle
+@code{i_handle}:
+
+@smallexample
+/* Compute which portion we will work on, e.g. pivot */
+starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
+
+/* And submit the corresponding task */
+STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R, starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
+@end smallexample
+
+The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
+acquiring data @code{i} for the main application, and will execute the code
+given as third parameter when it is acquired. In other words, as soon as the
+value of @code{i} computed by the @code{which_index} codelet can be read, the
+portion of code passed as third parameter of @code{STARPU_DATA_ACQUIRE_CB} will
+be executed, and is allowed to read from @code{i} to use it e.g. as an
+index. Note that this macro is only available when compiling StarPU with
+the compiler @code{gcc}.
+
+@node Parallel Tasks
+@section Parallel Tasks
+
+StarPU can leverage existing parallel computation libraries by the means of
+parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
+(called a parallel or combined worker) at the same time, by using an existing
+parallel CPU implementation of the computation to be achieved. This can also be
+useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
+work collectively on a single task, the completion time of tasks on CPUs becomes
+comparable to the completion time on GPUs, thus alleviating concerns about
+granularity discrepancy.
+
+Two modes of execution exist to accommodate existing usages.
+
+@subsection Fork-mode parallel tasks
+
+In the Fork mode, StarPU will call the codelet function on one
+of the CPUs of the combined worker. The codelet function can use
+@code{starpu_combined_worker_get_size()} to get the number of threads it is
+allowed to start to achieve the computation. The CPU binding mask is already
+enforced, so that threads created by the function will inherit the mask, and
+thus execute where StarPU expects them to. For instance, using OpenMP (the full
+source is available in @code{examples/openmp/vector_scal.c}):
+
+@example
+void scal_cpu_func(void *buffers[], void *_args)
+@{
+    unsigned i;
+    float *factor = _args;
+    struct starpu_vector_interface *vector = buffers[0];
+    unsigned n = STARPU_VECTOR_GET_NX(vector);
+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
+    for (i = 0; i < n; i++)
+        val[i] *= *factor;
+@}
+
+static struct starpu_codelet cl =
+@{
+    .modes = @{ STARPU_RW @},
+    .where = STARPU_CPU,
+    .type = STARPU_FORKJOIN,
+    .max_parallelism = INT_MAX,
+    .cpu_funcs = @{scal_cpu_func, NULL@},
+    .nbuffers = 1,
+@};
+@end example
+
+Other examples include for instance calling a BLAS parallel CPU implementation
+(see @code{examples/mult/xgemm.c}).
+
+@subsection SPMD-mode parallel tasks
+
+In the SPMD mode, StarPU will call the codelet function on
+each CPU of the combined worker. The codelet function can use
+@code{starpu_combined_worker_get_size()} to get the total number of CPUs
+involved in the combined worker, and thus the number of calls that are made in
+parallel to the function, and @code{starpu_combined_worker_get_rank()} to get
+the rank of the current CPU within the combined worker. For instance:
+
+@example
+static void func(void *buffers[], void *_args)
+@{
+    unsigned i;
+    float *factor = _args;
+    struct starpu_vector_interface *vector = buffers[0];
+    unsigned n = STARPU_VECTOR_GET_NX(vector);
+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+    /* Compute slice to compute */
+    unsigned m = starpu_combined_worker_get_size();
+    unsigned j = starpu_combined_worker_get_rank();
+    unsigned slice = (n+m-1)/m;
+
+    for (i = j * slice; i < (j+1) * slice && i < n; i++)
+        val[i] *= *factor;
+@}
+
+static struct starpu_codelet cl =
+@{
+    .modes = @{ STARPU_RW @},
+    .where = STARPU_CPU,
+    .type = STARPU_SPMD,
+    .max_parallelism = INT_MAX,
+    .cpu_funcs = @{ func, NULL @},
+    .nbuffers = 1,
+@};
+@end example
+
+Of course, this trivial example will not really benefit from parallel task
+execution, and was only meant to be simple to understand.  The benefit comes
+when the computation to be done is such that threads have to e.g. exchange
+intermediate results, or write to the same buffer in a complex but safe way.
+
+@subsection Parallel tasks performance
+
+To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
+be used. When exposed to codelets with a Fork or SPMD flag, the @code{pheft}
+(parallel-heft) and @code{pgreedy} (parallel greedy) schedulers will indeed also
+try to execute tasks with several CPUs. They will automatically try the various
+available combined worker sizes and thus be able to avoid choosing a large
+combined worker if the codelet does not actually scale so much.
+
+@subsection Combined worker sizes
+
+By default, StarPU creates combined workers according to the architecture
+structure as detected by hwloc. It means that for each object of the hwloc
+topology (NUMA node, socket, cache, ...) a combined worker will be created. If
+some nodes of the hierarchy have a big arity (e.g. many cores in a socket
+without a hierarchy of shared caches), StarPU will create combined workers of
+intermediate sizes.
+
+@subsection Concurrent parallel tasks
+
+Unfortunately, many environments and libraries do not support concurrent
+calls.
+
+For instance, most OpenMP implementations (including the main ones) do not
+support concurrent @code{pragma omp parallel} statements without nesting them in
+another @code{pragma omp parallel} statement, but StarPU does not yet support
+creating its CPU workers by using such a pragma.
+
+Other parallel libraries are also not safe when being invoked concurrently
+from different threads, due to the use of global variables in their sequential
+sections for instance.
+
+The solution is then to use only one combined worker at a time.  This can be
+done by setting @code{single_combined_worker} to 1 in the @code{starpu_conf}
+structure, or setting the @code{STARPU_SINGLE_COMBINED_WORKER} environment
+variable to 1. StarPU will then run only one parallel task at a time.
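+
+For instance, a minimal sketch of enforcing this from the application itself:
+
+@cartouche
+@smallexample
+struct starpu_conf conf;
+starpu_conf_init(&conf);
+/* Run only one parallel task at a time, to avoid concurrent calls
+ * into the underlying parallel library. */
+conf.single_combined_worker = 1;
+starpu_init(&conf);
+@end smallexample
+@end cartouche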
+
+@node Debugging
+@section Debugging
+
+StarPU provides several tools to help debugging applications. Execution traces
+can be generated and displayed graphically, see @ref{Generating traces}. Some
+gdb helpers are also provided to show the whole StarPU state:
+
+@smallexample
+(gdb) source tools/gdbinit
+(gdb) help starpu
+@end smallexample
+
+@node The multiformat interface
+@section The multiformat interface
+It may be interesting to represent the same piece of data using two different
+data structures: one that would only be used on CPUs, and one that would only
+be used on GPUs. This can be done by using the multiformat interface. StarPU
+will be able to convert data from one data structure to the other when needed.
+Note that the heft scheduler is the only one optimized for this interface. The
+user must provide StarPU with conversion codelets:
+
+@cartouche
+@smallexample
+#define NX 1024
+struct point array_of_structs[NX];
+starpu_data_handle_t handle;
+
+/*
+ * The conversion of a piece of data is itself a task, though it is created,
+ * submitted and destroyed by StarPU internals and not by the user. Therefore,
+ * we have to define two codelets.
+ * Note that for now the conversion from the CPU format to the GPU format has to
+ * be executed on the GPU, and the conversion from the GPU to the CPU has to be
+ * executed on the CPU.
+ */
+#ifdef STARPU_USE_OPENCL
+void cpu_to_opencl_opencl_func(void *buffers[], void *args);
+struct starpu_codelet cpu_to_opencl_cl = @{
+    .where = STARPU_OPENCL,
+    .opencl_funcs = @{ cpu_to_opencl_opencl_func, NULL @},
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
+@};
+
+void opencl_to_cpu_func(void *buffers[], void *args);
+struct starpu_codelet opencl_to_cpu_cl = @{
+    .where = STARPU_CPU,
+    .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
+@};
+#endif
+
+struct starpu_multiformat_data_interface_ops format_ops = @{
+#ifdef STARPU_USE_OPENCL
+    .opencl_elemsize = 2 * sizeof(float),
+    .cpu_to_opencl_cl = &cpu_to_opencl_cl,
+    .opencl_to_cpu_cl = &opencl_to_cpu_cl,
+#endif
+    .cpu_elemsize = 2 * sizeof(float),
+    ...
+@};
+starpu_multiformat_data_register(handle, 0, &array_of_structs, NX, &format_ops);
+@end smallexample
+@end cartouche
+
+Kernels can be written almost as for any other interface. Note that
+@code{STARPU_MULTIFORMAT_GET_CPU_PTR} shall only be used for CPU kernels. CUDA kernels
+must use @code{STARPU_MULTIFORMAT_GET_CUDA_PTR}, and OpenCL kernels must use
+@code{STARPU_MULTIFORMAT_GET_OPENCL_PTR}. @code{STARPU_MULTIFORMAT_GET_NX} may
+be used in any kind of kernel.
+@cartouche
+@smallexample
+static void
+multiformat_scal_cpu_func(void *buffers[], void *args)
+@{
+    struct point *aos;
+    unsigned int n;
+
+    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+    ...
+@}
+
+extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
+@{
+    unsigned int n;
+    struct struct_of_arrays *soa;
+
+    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+    ...
+@}
+@end smallexample
+@end cartouche
+
+A full example may be found in @code{examples/basic_examples/multiformat.c}.
+
+@node On-GPU rendering
+@section On-GPU rendering
+
+Graphical-oriented applications need to draw the result of their computations,
+typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
+interoperability permit letting CUDA work directly on the OpenGL buffers, thus
+making them immediately ready for drawing, by mapping OpenGL buffers, textures or
+renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
+be given the CUDA pointer at registration time, for instance:
+
+@cartouche
+@smallexample
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+        if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
+                break;
+
+cudaSetDevice(starpu_worker_get_devid(workerid));
+cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
+starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
+
+starpu_insert_task(&cl, STARPU_RW, handle, 0);
+
+starpu_data_unregister(handle);
+
+cudaSetDevice(starpu_worker_get_devid(workerid));
+cudaGraphicsUnmapResources(1, &resource, 0);
+
+/* Now display it */
+@end smallexample
+@end cartouche
+
+@node More examples
+@section More examples
+
+
+More examples are available in the StarPU sources in the @code{examples/}
+directory. Simple examples include:
+
+@table @asis
+@item @code{incrementer/}:
+    Trivial incrementation test.
+@item @code{basic_examples/}:
+        Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
+        in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
+        product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
+        interface, an example using the variable data interface, and an example
+        using different formats on CPUs and GPUs.
+@item @code{matvecmult/}:
+    OpenCL example from NVidia, adapted to StarPU.
+@item @code{axpy/}:
+    AXPY CUBLAS operation adapted to StarPU.
+@item @code{fortran/}:
+    Example of Fortran bindings.
+@end table
+
+More advanced examples include:
+
+@table @asis
+@item @code{filters/}:
+    Examples using filters, as shown in @ref{Partitioning Data}.
+@item @code{lu/}:
+    LU matrix factorization, see for instance @code{xlu_implicit.c}
+@item @code{cholesky/}:
+    Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
+@end table

File diff suppressed because it is too large
+ 2273 - 0
doc/chapters/basic-api.texi


+ 989 - 0
doc/chapters/basic-examples.texi

@@ -0,0 +1,989 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Compiling and linking options::  
+* Hello World::                 Submitting Tasks
+* Vector Scaling Using the C Extension::  
+* Vector Scaling Using StarPu's API::  
+* Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
+@end menu
+
+@node Compiling and linking options
+@section Compiling and linking options
+
+Let's suppose StarPU has been installed in the directory
+@code{$STARPU_DIR}. As explained in @ref{Setting flags for compiling and linking applications},
+the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
+necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
+libraries at runtime.
+
+@example
+% PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
+% LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
+@end example
+
+The Makefile could for instance contain the following lines to define which
+options must be given to the compiler and to the linker:
+
+@cartouche
+@example
+CFLAGS          +=      $$(pkg-config --cflags starpu-1.0)
+LDFLAGS         +=      $$(pkg-config --libs starpu-1.0)
+@end example
+@end cartouche
+
+Also pass the @code{--static} option if the application is to be linked statically.
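+
+As an illustrative sketch, a complete Makefile for a single-file program
+(here hypothetically called @file{hello_world.c}) could thus be as simple as:
+
+@cartouche
+@example
+CFLAGS          +=      $$(pkg-config --cflags starpu-1.0)
+LDFLAGS         +=      $$(pkg-config --libs starpu-1.0)
+
+hello_world: hello_world.c
+	$(CC) $(CFLAGS) $< -o $@@ $(LDFLAGS)
+@end example
+@end cartouche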
+
+@node Hello World
+@section Hello World
+
+This section shows how to implement a simple program that submits a task
+to StarPU. You can either use the StarPU C extension (@pxref{C
+Extensions}) or directly use the StarPU's API.
+
+@menu
+* Hello World using the C Extension::  
+* Hello World using StarPU's API::  
+@end menu
+
+@node Hello World using the C Extension
+@subsection Hello World using the C Extension
+
+Writing a task is both simpler and less error-prone when using the C
+extensions implemented by StarPU's GCC plug-in (@pxref{C Extensions}).
+In a nutshell, all it takes is to declare a task, declare and define its
+implementations (for CPU, OpenCL, and/or CUDA), and invoke the task like
+a regular C function.  The example below defines @code{my_task}, which
+has a single implementation for CPU:
+
+@cartouche
+@smallexample
+/* Task declaration.  */
+static void my_task (int x) __attribute__ ((task));
+
+/* Declaration of the CPU implementation of `my_task'.  */
+static void my_task_cpu (int x) __attribute__ ((task_implementation ("cpu", my_task)));
+
+/* Definition of said CPU implementation.  */
+static void my_task_cpu (int x)
+@{
+  printf ("Hello, world!  With x = %d\n", x);
+@}
+
+int main ()
+@{
+  /* Initialize StarPU.  */
+#pragma starpu initialize
+
+  /* Do an asynchronous call to `my_task'.  */
+  my_task (42);
+
+  /* Wait for the call to complete.  */
+#pragma starpu wait
+
+  /* Terminate.  */
+#pragma starpu shutdown
+
+  return 0;
+@}
+@end smallexample
+@end cartouche
+
+@noindent
+The code can then be compiled and linked with GCC and the
+@code{-fplugin} flag:
+
+@example
+$ gcc hello-starpu.c \
+    -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
+    `pkg-config starpu-1.0 --libs`
+@end example
+
+As can be seen above, basic use of the C extensions allows programmers to
+use StarPU tasks while essentially annotating ``regular'' C code.
+
+@node Hello World using StarPU's API
+@subsection Hello World using StarPU's API
+
+The remainder of this section shows how to achieve the same result using
+StarPU's standard C API.
+
+@menu
+* Required Headers::            
+* Defining a Codelet::          
+* Submitting a Task::           
+* Execution of Hello World::    
+@end menu
+
+@node Required Headers
+@subsubsection Required Headers
+
+The @code{starpu.h} header should be included in any code using StarPU.
+
+@cartouche
+@smallexample
+#include <starpu.h>
+@end smallexample
+@end cartouche
+
+
+@node Defining a Codelet
+@subsubsection Defining a Codelet
+
+@cartouche
+@smallexample
+struct params @{
+    int i;
+    float f;
+@};
+void cpu_func(void *buffers[], void *cl_arg)
+@{
+    struct params *params = cl_arg;
+
+    printf("Hello world (params = @{%i, %f@} )\n", params->i, params->f);
+@}
+
+struct starpu_codelet cl =
+@{
+    .where = STARPU_CPU,
+    .cpu_funcs = @{ cpu_func, NULL @},
+    .nbuffers = 0
+@};
+@end smallexample
+@end cartouche
+
+A codelet is a structure that represents a computational kernel. Such a codelet
+may contain an implementation of the same kernel on different architectures
+(e.g. CUDA, Cell's SPU, x86, ...).
+
+The @code{nbuffers} field specifies the number of data buffers that are
+manipulated by the codelet: here the codelet does not access or modify any data
+that is controlled by our data management library. Note that the argument
+passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
+structure) does not count as a buffer since it is not managed by our data
+management library, but just contains trivial parameters.
+
+@c TODO need a crossref to the proper description of "where" see bla for more ...
+We create a codelet which may only be executed on the CPUs. The @code{where}
+field is a bitmask that defines where the codelet may be executed. Here, the
+@code{STARPU_CPU} value means that only CPUs can execute this codelet
+(@pxref{Codelets and Tasks} for more details on this field). Note that
+the @code{where} field is optional, when unset its value is
+automatically set based on the availability of the different
+@code{XXX_funcs} fields.
+When a CPU core executes a codelet, it calls the @code{cpu_func} function,
+which @emph{must} have the following prototype:
+
+@code{void (*cpu_func)(void *buffers[], void *cl_arg);}
+
+In this example, we can ignore the first argument of this function which gives a
+description of the input and output buffers (e.g. the size and the location of
+the matrices) since there is none.
+The second argument is a pointer to a buffer passed as an
+argument to the codelet by the means of the @code{cl_arg} field of the
+@code{starpu_task} structure.
+
+@c TODO rewrite so that it is a little clearer ?
+Be aware that this may be a pointer to a
+@emph{copy} of the actual buffer, and not the pointer given by the programmer:
+if the codelet modifies this buffer, there is no guarantee that the initial
+buffer will be modified as well: this for instance implies that the buffer
+cannot be used as a synchronization medium. If synchronization is needed, data
+has to be registered to StarPU, see @ref{Vector Scaling Using StarPu's API}.
+
+@node Submitting a Task
+@subsubsection Submitting a Task
+
+@cartouche
+@smallexample
+void callback_func(void *callback_arg)
+@{
+    printf("Callback function (arg %x)\n", callback_arg);
+@}
+
+int main(int argc, char **argv)
+@{
+    /* @b{initialize StarPU} */
+    starpu_init(NULL);
+
+    struct starpu_task *task = starpu_task_create();
+
+    task->cl = &cl; /* @b{Pointer to the codelet defined above} */
+
+    struct params params = @{ 1, 2.0f @};
+    task->cl_arg = &params;
+    task->cl_arg_size = sizeof(params);
+
+    task->callback_func = callback_func;
+    task->callback_arg = 0x42;
+
+    /* @b{starpu_task_submit will be a blocking call} */
+    task->synchronous = 1;
+
+    /* @b{submit the task to StarPU} */
+    starpu_task_submit(task);
+
+    /* @b{terminate StarPU} */
+    starpu_shutdown();
+
+    return 0;
+@}
+@end smallexample
+@end cartouche
+
+Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
+@code{NULL} argument specifies that we use default configuration. Tasks cannot
+be submitted after the termination of StarPU by a call to
+@code{starpu_shutdown}.
+
+In the example above, a task structure is allocated by a call to
+@code{starpu_task_create}. This function only allocates and fills the
+corresponding structure with the default settings (@pxref{Codelets and
+Tasks, starpu_task_create}), but it does not submit the task to StarPU.
+
+@c not really clear ;)
+The @code{cl} field is a pointer to the codelet which the task will
+execute: in other words, the codelet structure describes which computational
+kernel should be offloaded on the different architectures, and the task
+structure is a wrapper containing a codelet and the piece of data on which the
+codelet should operate.
+
+The optional @code{cl_arg} field is a pointer to a buffer (of size
+@code{cl_arg_size}) with some parameters for the kernel
+described by the codelet. For instance, if a codelet implements a computational
+kernel that multiplies its input vector by a constant, the constant could be
+specified by the means of this buffer, instead of registering it as a StarPU
+data. It must however be noted that StarPU avoids making copies whenever possible
+and rather passes the pointer as such, so the buffer which is pointed to must be
+kept allocated until the task terminates, and if several tasks are submitted
+with various parameters, each of them must be given a pointer to their own
+buffer.
+
+Once a task has been executed, an optional callback function is called.
+While the computational kernel could be offloaded on various architectures, the
+callback function is always executed on a CPU. The @code{callback_arg}
+pointer is passed as an argument of the callback. The prototype of a callback
+function must be:
+
+@code{void (*callback_function)(void *);}
+
+If the @code{synchronous} field is non-zero, task submission will be
+synchronous: the @code{starpu_task_submit} function will not return until the
+task was executed. Note that the @code{starpu_shutdown} method does not
+guarantee that asynchronous tasks have been executed before it returns,
+@code{starpu_task_wait_for_all} can be used to that effect, or data can be
+unregistered (@code{starpu_data_unregister(vector_handle);}), which will
+implicitly wait for all the tasks scheduled to work on it, unless explicitly
+disabled thanks to @code{starpu_data_set_default_sequential_consistency_flag} or
+@code{starpu_data_set_sequential_consistency_flag}.
+
+@node Execution of Hello World
+@subsubsection Execution of Hello World
+
+@smallexample
+% make hello_world
+cc $(pkg-config --cflags starpu-1.0)  $(pkg-config --libs starpu-1.0) hello_world.c -o hello_world
+% ./hello_world
+Hello world (params = @{1, 2.000000@} )
+Callback function (arg 42)
+@end smallexample
+
+@node Vector Scaling Using the C Extension
+@section Vector Scaling Using the C Extension
+
+The previous example has shown how to submit tasks. In this section,
+we show how StarPU tasks can manipulate data. The version of this
+example using StarPU's API is given in the next sections.
+
+
+@menu
+* Adding an OpenCL Task Implementation::  
+* Adding a CUDA Task Implementation::  
+@end menu
+
+The simplest way to get started writing StarPU programs is using the C
+language extensions provided by the GCC plug-in (@pxref{C Extensions}).
+These extensions map directly to StarPU's main concepts: tasks, task
+implementations for CPU, OpenCL, or CUDA, and registered data buffers.
+
+The example below is a vector-scaling program, that multiplies elements
+of a vector by a given factor@footnote{The complete example, and
+additional examples, is available in the @file{gcc-plugin/examples}
+directory of the StarPU distribution.}.  For comparison, the standard C
+version that uses StarPU's standard C programming interface is given in
+the next section (@pxref{Vector Scaling Using StarPu's API, standard C
+version of the example}).
+
+First of all, the vector-scaling task and its simple CPU implementation
+has to be defined:
+
+@cartouche
+@smallexample
+/* Declare the `vector_scal' task.  */
+
+static void vector_scal (size_t size, float vector[size],
+                         float factor)
+  __attribute__ ((task));
+
+/* Declare and define the standard CPU implementation.  */
+
+static void vector_scal_cpu (size_t size, float vector[size],
+                             float factor)
+  __attribute__ ((task_implementation ("cpu", vector_scal)));
+
+static void
+vector_scal_cpu (size_t size, float vector[size], float factor)
+@{
+  size_t i;
+  for (i = 0; i < size; i++)
+    vector[i] *= factor;
+@}
+@end smallexample
+@end cartouche
+
+Next, the body of the program, which uses the task defined above, can be
+implemented:
+
+@cartouche
+@smallexample
+int
+main (void)
+@{
+#pragma starpu initialize
+
+#define NX     0x100000
+#define FACTOR 3.14
+
+  @{
+    float vector[NX] __attribute__ ((heap_allocated));
+
+#pragma starpu register vector
+
+    size_t i;
+    for (i = 0; i < NX; i++)
+      vector[i] = (float) i;
+
+    vector_scal (NX, vector, FACTOR);
+
+#pragma starpu wait
+  @} /* VECTOR is automatically freed here.  */
+
+#pragma starpu shutdown
+
+  return EXIT_SUCCESS;
+@}
+@end smallexample
+@end cartouche
+
+@noindent
+The @code{main} function above does several things:
+
+@itemize
+@item
+It initializes StarPU.
+
+@item
+It allocates @var{vector} in the heap; it will automatically be freed
+when its scope is left.  Alternatively, good old @code{malloc} and
+@code{free} could have been used, but they are more error-prone and
+require more typing.
+
+@item
+It @dfn{registers} the memory pointed to by @var{vector}.  Eventually,
+when OpenCL or CUDA task implementations are added, this will allow
+StarPU to transfer that memory region between GPUs and the main memory.
+Removing this @code{pragma} is an error.
+
+@item
+It invokes the @code{vector_scal} task.  The invocation looks the same
+as a standard C function call.  However, it is an @dfn{asynchronous
+invocation}, meaning that the actual call is performed in parallel with
+the caller's continuation.
+
+@item
+It @dfn{waits} for the termination of the @code{vector_scal}
+asynchronous call.
+
+@item
+Finally, StarPU is shut down.
+
+@end itemize
+
+The program can be compiled and linked with GCC and the @code{-fplugin}
+flag:
+
+@example
+$ gcc hello-starpu.c \
+    -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
+    `pkg-config starpu-1.0 --libs`
+@end example
+
+And voil@`a!
+
+@node Adding an OpenCL Task Implementation
+@subsection Adding an OpenCL Task Implementation
+
+Now, this is all fine and great, but you certainly want to take
+advantage of these newfangled GPUs that your lab just bought, don't you?
+
+So, let's add an OpenCL implementation of the @code{vector_scal} task.
+We assume that the OpenCL kernel is available in a file,
+@file{vector_scal_opencl_kernel.cl}, not shown here.  The OpenCL task
+implementation is similar to that used with the standard C API
+(@pxref{Definition of the OpenCL Kernel}).  It is declared and defined
+in our C file like this:
+
+@cartouche
+@smallexample
+/* Include StarPU's OpenCL integration.  */
+#include <starpu_opencl.h>
+
+/* The OpenCL programs, loaded from `main' (see below).  */
+static struct starpu_opencl_program cl_programs;
+
+static void vector_scal_opencl (size_t size, float vector[size],
+                                float factor)
+  __attribute__ ((task_implementation ("opencl", vector_scal)));
+
+@c TODO This example will not work: size cannot be a size_t in clSetKernelArg, and global should not be 1. Do we want to document the ugly hack we use, though?
+static void
+vector_scal_opencl (size_t size, float vector[size], float factor)
+@{
+  int id, devid, err;
+  cl_kernel kernel;
+  cl_command_queue queue;
+  cl_event event;
+
+  /* VECTOR is a GPU memory pointer, not a main memory pointer.  */
+  cl_mem val = (cl_mem) vector;
+
+  id = starpu_worker_get_id ();
+  devid = starpu_worker_get_devid (id);
+
+  /* Prepare to invoke the kernel.  In the future, this will be largely
+     automated.  */
+  err = starpu_opencl_load_kernel (&kernel, &queue, &cl_programs,
+                                   "vector_mult_opencl", devid);
+  if (err != CL_SUCCESS)
+    STARPU_OPENCL_REPORT_ERROR (err);
+
+  err = clSetKernelArg (kernel, 0, sizeof (val), &val);
+  err |= clSetKernelArg (kernel, 1, sizeof (size), &size);
+  err |= clSetKernelArg (kernel, 2, sizeof (factor), &factor);
+  if (err)
+    STARPU_OPENCL_REPORT_ERROR (err);
+
+  size_t global = 1, local = 1;
+  err = clEnqueueNDRangeKernel (queue, kernel, 1, NULL, &global,
+                                &local, 0, NULL, &event);
+  if (err != CL_SUCCESS)
+    STARPU_OPENCL_REPORT_ERROR (err);
+
+  clFinish (queue);
+  starpu_opencl_collect_stats (event);
+  clReleaseEvent (event);
+
+  /* Done with KERNEL.  */
+  starpu_opencl_release_kernel (kernel);
+@}
+@end smallexample
+@end cartouche
+
+@noindent
+The OpenCL kernel itself must be loaded from @code{main}, sometime after
+the @code{initialize} pragma:
+
+@cartouche
+@smallexample
+  starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
+                                       &cl_programs, "");
+@end smallexample
+@end cartouche
+
+@noindent
+And that's it.  The @code{vector_scal} task now has an additional
+implementation, for OpenCL, which StarPU's scheduler may choose to use
+at run-time.  Unfortunately, the @code{vector_scal_opencl} above still
+has to go through the common OpenCL boilerplate; in the future,
+additional extensions will automate most of it.
+
+@node Adding a CUDA Task Implementation
+@subsection Adding a CUDA Task Implementation
+
+Adding a CUDA implementation of the task is very similar, except that
+the implementation itself is typically written in CUDA, and compiled
+with @code{nvcc}.  Thus, the C file only needs to contain an external
+declaration for the task implementation:
+
+@cartouche
+@smallexample
+extern void vector_scal_cuda (size_t size, float vector[size],
+                              float factor)
+  __attribute__ ((task_implementation ("cuda", vector_scal)));
+@end smallexample
+@end cartouche
+
+The actual implementation of the CUDA task goes into a separate
+compilation unit, in a @file{.cu} file.  It is very close to the
+implementation when using StarPU's standard C API (@pxref{Definition of
+the CUDA Kernel}).
+
+@cartouche
+@smallexample
+/* CUDA implementation of the `vector_scal' task, to be compiled
+   with `nvcc'.  */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include <stdlib.h>
+
+static __global__ void
+vector_mult_cuda (float *val, unsigned n, float factor)
+@{
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (i < n)
+    val[i] *= factor;
+@}
+
+/* Definition of the task implementation declared in the C file.   */
+extern "C" void
+vector_scal_cuda (size_t size, float vector[], float factor)
+@{
+  unsigned threads_per_block = 64;
+  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
+
+  vector_mult_cuda <<< nblocks, threads_per_block, 0,
+    starpu_cuda_get_local_stream () >>> (vector, size, factor);
+
+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
+@}
+@end smallexample
+@end cartouche
+
+The complete source code, in the @file{gcc-plugin/examples/vector_scal}
+directory of the StarPU distribution, also shows how an SSE-specialized
+CPU task implementation can be added.
+
+For more details on the C extensions provided by StarPU's GCC plug-in,
+see @ref{C Extensions}.
+
+@node Vector Scaling Using StarPu's API
+@section Vector Scaling Using StarPu's API
+
+This section shows how to achieve the same result as explained in the
+previous section using StarPU's standard C API.
+
+The full source code for
+this example is given in @ref{Full source code for the 'Scaling a
+Vector' example}.
+
+@menu
+* Source Code of Vector Scaling::  
+* Execution of Vector Scaling::  Running the program
+@end menu
+
+@node Source Code of Vector Scaling
+@subsection Source Code of Vector Scaling
+
+Programmers can describe the data layout of their application so that StarPU is
+responsible for enforcing data coherency and availability across the machine.
+Instead of handling complex (and non-portable) mechanisms to perform data
+movements, programmers only declare which piece of data is accessed and/or
+modified by a task, and StarPU makes sure that when a computational kernel
+starts somewhere (e.g. on a GPU), its data are available locally.
+
+Before submitting those tasks, the programmer first needs to declare the
+different pieces of data to StarPU using the @code{starpu_*_data_register}
+functions. To ease the development of applications for StarPU, it is possible
+to describe multiple types of data layout. A type of data layout is called an
+@b{interface}. There are different predefined interfaces available in StarPU:
+here we will consider the @b{vector interface}.
+
+The following lines show how to declare an array of @code{NX} elements of type
+@code{float} using the vector interface:
+
+@cartouche
+@smallexample
+float vector[NX];
+
+starpu_data_handle_t vector_handle;
+starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
+                            sizeof(vector[0]));
+@end smallexample
+@end cartouche
+
+The first argument, called the @b{data handle}, is an opaque pointer which
+designates the array in StarPU. This is also the structure which is used to
+describe which data is used by a task. The second argument is the node number
+where the data originally resides. Here it is 0 since the @code{vector} array is in
+the main memory. Then comes the pointer @code{vector} where the data can be found in main memory,
+the number of elements in the vector and the size of each element.
+The following shows how to construct a StarPU task that will manipulate the
+vector and a constant factor.
+
+@cartouche
+@smallexample
+float factor = 3.14;
+struct starpu_task *task = starpu_task_create();
+
+task->cl = &cl;                      /* @b{Pointer to the codelet defined below} */
+task->handles[0] = vector_handle;    /* @b{First parameter of the codelet} */
+task->cl_arg = &factor;
+task->cl_arg_size = sizeof(factor);
+task->synchronous = 1;
+
+starpu_task_submit(task);
+@end smallexample
+@end cartouche
+
+Since the factor is a mere constant float value parameter,
+it does not need a preliminary registration, and
+can just be passed through the @code{cl_arg} pointer like in the previous
+example.  The vector parameter is described by its handle, which is stored in
+the task's @code{handles} array. The corresponding entry of the codelet's
+@code{modes} array specifies how the kernel will access the data
+(@code{STARPU_R} for read-only, @code{STARPU_W} for
+write-only and @code{STARPU_RW} for read and write access).
+
+The definition of the codelet can be written as follows:
+
+@cartouche
+@smallexample
+void scal_cpu_func(void *buffers[], void *cl_arg)
+@{
+    unsigned i;
+    float *factor = cl_arg;
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    /* CPU copy of the vector pointer */
+    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+
+    for (i = 0; i < n; i++)
+        val[i] *= *factor;
+@}
+
+struct starpu_codelet cl = @{
+    .where = STARPU_CPU,
+    .cpu_funcs = @{ scal_cpu_func, NULL @},
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
+@};
+@end smallexample
+@end cartouche
+
+The first argument is an array that gives
+a description of all the buffers passed in the @code{task->handles} array. The
+size of this array is given by the @code{nbuffers} field of the codelet
+structure. For the sake of genericity, this array contains pointers to the
+different interfaces describing each buffer.  In the case of the @b{vector
+interface}, the location of the vector (resp. its length) is accessible through the
+@code{ptr} (resp. @code{nx}) field of this interface. Since the vector is accessed in a
+read-write fashion, any modification will automatically affect future accesses
+to this vector made by other tasks.
+
+The second argument of the @code{scal_cpu_func} function contains a pointer to the
+parameters of the codelet (given in @code{task->cl_arg}), so that we read the
+constant factor from this pointer.
+
+@node Execution of Vector Scaling
+@subsection Execution of Vector Scaling
+
+@smallexample
+% make vector_scal
+cc $(pkg-config --cflags starpu-1.0)  $(pkg-config --libs starpu-1.0)  vector_scal.c   -o vector_scal
+% ./vector_scal
+0.000000 3.000000 6.000000 9.000000 12.000000
+@end smallexample
+
+@node Vector Scaling on an Hybrid CPU/GPU Machine
+@section Vector Scaling on an Hybrid CPU/GPU Machine
+
+Contrary to the previous examples, the task submitted in this example may not
+only be executed by the CPUs, but also by a CUDA or an OpenCL device.
+
+@menu
+* Definition of the CUDA Kernel::  
+* Definition of the OpenCL Kernel::  
+* Definition of the Main Code::  
+* Execution of Hybrid Vector Scaling::  
+@end menu
+
+@node Definition of the CUDA Kernel
+@subsection Definition of the CUDA Kernel
+
+The CUDA implementation can be written as follows. It needs to be compiled with
+a CUDA compiler such as nvcc, the NVIDIA CUDA compiler driver. It must be noted
+that the vector pointer returned by STARPU_VECTOR_GET_PTR is here a pointer in GPU
+memory, so that it can be passed as such to the @code{vector_mult_cuda} kernel
+call.
+
+@cartouche
+@smallexample
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+static __global__ void vector_mult_cuda(float *val, unsigned n,
+                                        float factor)
+@{
+    unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+    if (i < n)
+        val[i] *= factor;
+@}
+
+extern "C" void scal_cuda_func(void *buffers[], void *_args)
+@{
+    float *factor = (float *)_args;
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    /* CUDA copy of the vector pointer */
+    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+    unsigned threads_per_block = 64;
+    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+
+@i{    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);}
+
+@i{    cudaStreamSynchronize(starpu_cuda_get_local_stream());}
+@}
+@end smallexample
+@end cartouche
+
+@node Definition of the OpenCL Kernel
+@subsection Definition of the OpenCL Kernel
+
+The OpenCL implementation can be written as follows. StarPU provides
+tools to compile an OpenCL kernel stored in a file.
+
+@cartouche
+@smallexample
+__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
+@{
+        const int i = get_global_id(0);
+        if (i < nx) @{
+                val[i] *= factor;
+        @}
+@}
+@end smallexample
+@end cartouche
+
+Contrary to the CUDA and CPU versions, @code{STARPU_VECTOR_GET_DEV_HANDLE} has
+to be used here. It returns a @code{cl_mem}, which is not a device pointer but
+an OpenCL handle, and can be passed as such to the OpenCL kernel. The difference is
+important when using partitioning, see @ref{Partitioning Data}.
+
+@cartouche
+@smallexample
+#include <starpu.h>
+@i{#include <starpu_opencl.h>}
+
+@i{extern struct starpu_opencl_program programs;}
+
+void scal_opencl_func(void *buffers[], void *_args)
+@{
+    float *factor = _args;
+@i{    int id, devid, err;}
+@i{    cl_kernel kernel;}
+@i{    cl_command_queue queue;}
+@i{    cl_event event;}
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    /* OpenCL copy of the vector pointer */
+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+
+@i{    id = starpu_worker_get_id();}
+@i{    devid = starpu_worker_get_devid(id);}
+
+@i{    err = starpu_opencl_load_kernel(&kernel, &queue, &programs,}
+@i{                    "vector_mult_opencl", devid);   /* @b{Name of the codelet defined above} */}
+@i{    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
+
+@i{    err = clSetKernelArg(kernel, 0, sizeof(val), &val);}
+@i{    err |= clSetKernelArg(kernel, 1, sizeof(n), &n);}
+@i{    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);}
+@i{    if (err) STARPU_OPENCL_REPORT_ERROR(err);}
+
+@i{    @{}
+@i{        size_t global=n;}
+@i{        size_t local=1;}
+@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);}
+@i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
+@i{    @}}
+
+@i{    clFinish(queue);}
+@i{    starpu_opencl_collect_stats(event);}
+@i{    clReleaseEvent(event);}
+
+@i{    starpu_opencl_release_kernel(kernel);}
+@}
+@end smallexample
+@end cartouche
+
+
+@node Definition of the Main Code
+@subsection Definition of the Main Code
+
+The CPU implementation is the same as in the previous section.
+
+Here is the source of the main application. You can notice the value of the
+field @code{where} for the codelet. We specify
+@code{STARPU_CPU|STARPU_CUDA|STARPU_OPENCL} to indicate to StarPU that the codelet
+can be executed either on a CPU or on a CUDA or an OpenCL device.
+
+@cartouche
+@smallexample
+#include <starpu.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define NX 2048
+
+extern void scal_cuda_func(void *buffers[], void *_args);
+extern void scal_cpu_func(void *buffers[], void *_args);
+extern void scal_opencl_func(void *buffers[], void *_args);
+
+/* @b{Definition of the codelet} */
+static struct starpu_codelet cl = @{
+    .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL, /* @b{It can be executed on a CPU,} */
+                                     /* @b{on a CUDA device, or on an OpenCL device} */
+    .cuda_funcs = @{ scal_cuda_func, NULL @},
+    .cpu_funcs = @{ scal_cpu_func, NULL @},
+    .opencl_funcs = @{ scal_opencl_func, NULL @},
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
+@};
+
+#ifdef STARPU_USE_OPENCL
+/* @b{The compiled version of the OpenCL program} */
+struct starpu_opencl_program programs;
+#endif
+
+int main(int argc, char **argv)
+@{
+    float *vector;
+    int i, ret;
+    float factor=3.0;
+    struct starpu_task *task;
+    starpu_data_handle_t vector_handle;
+
+    starpu_init(NULL);                            /* @b{Initialising StarPU} */
+
+#ifdef STARPU_USE_OPENCL
+    starpu_opencl_load_opencl_from_file(
+            "examples/basic_examples/vector_scal_opencl_codelet.cl",
+            &programs, NULL);
+#endif
+
+    vector = malloc(NX*sizeof(vector[0]));
+    assert(vector);
+    for(i=0 ; i<NX ; i++) vector[i] = i;
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    /* @b{Registering data within StarPU} */
+    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+                                NX, sizeof(vector[0]));
+
+    /* @b{Definition of the task} */
+    task = starpu_task_create();
+    task->cl = &cl;
+    task->handles[0] = vector_handle;
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    /* @b{Submitting the task} */
+    ret = starpu_task_submit(task);
+    if (ret == -ENODEV) @{
+            fprintf(stderr, "No worker may execute this task\n");
+            return 1;
+    @}
+
+@c TODO: Mmm, should rather be an unregistration with an implicit dependency, no?
+    /* @b{Waiting for its termination} */
+    starpu_task_wait_for_all();
+
+    /* @b{Update the vector in RAM} */
+    starpu_data_acquire(vector_handle, STARPU_R);
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    /* @b{Access the data} */
+    for(i=0 ; i<NX; i++) @{
+      fprintf(stderr, "%f ", vector[i]);
+    @}
+    fprintf(stderr, "\n");
+
+    /* @b{Release the RAM view of the data before unregistering it and shutting down StarPU} */
+    starpu_data_release(vector_handle);
+    starpu_data_unregister(vector_handle);
+    starpu_shutdown();
+
+    return 0;
+@}
+@end smallexample
+@end cartouche
+
+@node Execution of Hybrid Vector Scaling
+@subsection Execution of Hybrid Vector Scaling
+
+The Makefile given at the beginning of the section must be extended to
+give the rules to compile the CUDA source code. Note that the source
+file of the OpenCL kernel does not need to be compiled now; it will
+be compiled at run-time when calling the function
+@code{starpu_opencl_load_opencl_from_file()} (@pxref{starpu_opencl_load_opencl_from_file}).
+
+@cartouche
+@smallexample
+CFLAGS  += $(shell pkg-config --cflags starpu-1.0)
+LDFLAGS += $(shell pkg-config --libs starpu-1.0)
+CC       = gcc
+
+vector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o
+
+%.o: %.cu
+       nvcc $(CFLAGS) $< -c -o $@
+
+clean:
+       rm -f vector_scal *.o
+@end smallexample
+@end cartouche
+
+@smallexample
+% make
+@end smallexample
+
+and to execute it, with the default configuration:
+
+@smallexample
+% ./vector_scal
+0.000000 3.000000 6.000000 9.000000 12.000000
+@end smallexample
+
+or for example, by disabling CPU devices:
+
+@smallexample
+% STARPU_NCPUS=0 ./vector_scal
+0.000000 3.000000 6.000000 9.000000 12.000000
+@end smallexample
+
+or by disabling CUDA devices (which may in turn enable the use of OpenCL,
+see @ref{Enabling OpenCL}):
+
+@smallexample
+% STARPU_NCUDA=0 ./vector_scal
+0.000000 3.000000 6.000000 9.000000 12.000000
+@end smallexample

+ 411 - 0
doc/chapters/c-extensions.texi

@@ -0,0 +1,411 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@cindex C extensions
+@cindex GCC plug-in
+
+When GCC plug-in support is available, StarPU builds a plug-in for the
+GNU Compiler Collection (GCC), which defines extensions to languages of
+the C family (C, C++, Objective-C) that make it easier to write StarPU
+code@footnote{This feature is only available for GCC 4.5 and later.  It
+can be disabled by configuring with @code{--disable-gcc-extensions}.}.
+
+Those extensions include syntactic sugar for defining
+tasks and their implementations, invoking a task, and manipulating data
+buffers.  Use of these extensions can be made conditional on the
+availability of the plug-in, leading to valid C sequential code when the
+plug-in is not used (@pxref{Conditional Extensions}).
+
+When StarPU has been installed with its GCC plug-in, programs that use
+these extensions can be compiled this way:
+
+@example
+$ gcc -c -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` foo.c
+@end example
+
+@noindent
+When the plug-in is not available, the above @command{pkg-config}
+command returns the empty string.
+
+In addition, the @code{-fplugin-arg-starpu-verbose} flag can be used to
+obtain feedback from the compiler as it analyzes the C extensions used
+in source files.
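+
+For instance, one may compile a file with verbose feedback from the plug-in
+as follows (re-using the compilation command shown above):
+
+@example
+$ gcc -c -fplugin=`pkg-config starpu-1.0 --variable=gccplugin` \
+      -fplugin-arg-starpu-verbose foo.c
+@end example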
+
+This section describes the C extensions implemented by StarPU's GCC
+plug-in.  It does not require detailed knowledge of the StarPU library.
+
+Note: as of StarPU @value{VERSION}, this is still an area under
+development and subject to change.
+
+@menu
+* Defining Tasks::              Defining StarPU tasks
+* Synchronization and Other Pragmas:: Synchronization, and more.
+* Registered Data Buffers::     Manipulating data buffers
+* Conditional Extensions::      Using C extensions only when available
+@end menu
+
+@node Defining Tasks
+@section Defining Tasks
+
+@cindex task
+@cindex task implementation
+
+The StarPU GCC plug-in views @dfn{tasks} as ``extended'' C functions:
+
+@enumerate
+@item
+tasks may have several implementations---e.g., one for CPUs, one written
+in OpenCL, one written in CUDA;
+@item
+tasks may have several implementations for the same target---e.g.,
+several CPU implementations;
+@item
+when a task is invoked, it may run in parallel, and StarPU is free to
+choose any of its implementations.
+@end enumerate
+
+Tasks and their implementations must be @emph{declared}.  These
+declarations are annotated with @dfn{attributes} (@pxref{Attribute
+Syntax, attributes in GNU C,, gcc, Using the GNU Compiler Collection
+(GCC)}): the declaration of a task is a regular C function declaration
+with an additional @code{task} attribute, and task implementations are
+declared with a @code{task_implementation} attribute.
+
+The following function attributes are provided:
+
+@table @code
+
+@item task
+@cindex @code{task} attribute
+Declare the given function as a StarPU task.  Its return type must be
+@code{void}, and it must not be defined---instead, a definition will
+automatically be provided by the compiler.
+
+Under the hood, declaring a task leads to the declaration of the
+corresponding @code{codelet} (@pxref{Codelets and Tasks}).  If one or
+more task implementations are declared in the same compilation unit,
+then the codelet and the function itself are also defined; they inherit
+the scope of the task.
+
+Scalar arguments to the task are passed by value and copied to the
+target device if need be---technically, they are passed as the
+@code{cl_arg} buffer (@pxref{Codelets and Tasks, @code{cl_arg}}).
+
+@cindex @code{output} type attribute
+Pointer arguments are assumed to be registered data buffers---the
+@code{buffers} argument of a task (@pxref{Codelets and Tasks,
+@code{buffers}}); @code{const}-qualified pointer arguments are viewed as
+read-only buffers (@code{STARPU_R}), and non-@code{const}-qualified
+buffers are assumed to be used read-write (@code{STARPU_RW}).  In
+addition, the @code{output} type attribute can be used as a type qualifier
+for output pointer or array parameters (@code{STARPU_W}).
+
+@item task_implementation (@var{target}, @var{task})
+@cindex @code{task_implementation} attribute
+Declare the given function as an implementation of @var{task} to run on
+@var{target}.  @var{target} must be a string, currently one of
+@code{"cpu"}, @code{"opencl"}, or @code{"cuda"}.
+@c FIXME: Update when OpenCL support is ready.
+
+@end table
+
+Here is an example:
+
+@cartouche
+@smallexample
+#define __output  __attribute__ ((output))
+
+static void matmul (const float *A, const float *B,
+                    __output float *C,
+                    size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task));
+
+static void matmul_cpu (const float *A, const float *B,
+                        __output float *C,
+                        size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task_implementation ("cpu", matmul)));
+
+
+static void
+matmul_cpu (const float *A, const float *B, __output float *C,
+            size_t nx, size_t ny, size_t nz)
+@{
+  size_t i, j, k;
+
+  for (j = 0; j < ny; j++)
+    for (i = 0; i < nx; i++)
+      @{
+        for (k = 0; k < nz; k++)
+          C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
+      @}
+@}
+@end smallexample
+@end cartouche
+
+@noindent
+A @code{matmul} task is defined; it has only one implementation,
+@code{matmul_cpu}, which runs on the CPU.  Variables @var{A} and
+@var{B} are input buffers, whereas @var{C} is considered an input/output
+buffer.
+
+CUDA and OpenCL implementations can be declared in a similar way:
+
+@cartouche
+@smallexample
+static void matmul_cuda (const float *A, const float *B, float *C,
+                         size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task_implementation ("cuda", matmul)));
+
+static void matmul_opencl (const float *A, const float *B, float *C,
+                           size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task_implementation ("opencl", matmul)));
+@end smallexample
+@end cartouche
+
+@noindent
+The CUDA and OpenCL implementations typically either invoke a kernel
+written in CUDA or OpenCL (for similar code, @pxref{CUDA Kernel}, and
+@pxref{OpenCL Kernel}), or call a library function that uses CUDA or
+OpenCL under the hood, such as CUBLAS functions:
+
+@cartouche
+@smallexample
+static void
+matmul_cuda (const float *A, const float *B, float *C,
+             size_t nx, size_t ny, size_t nz)
+@{
+  cublasSgemm ('n', 'n', nx, ny, nz,
+               1.0f, A, 0, B, 0,
+               0.0f, C, 0);
+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
+@}
+@end smallexample
+@end cartouche
+
+A task can be invoked like a regular C function:
+
+@cartouche
+@smallexample
+matmul (&A[i * zdim * bydim + k * bzdim * bydim],
+        &B[k * xdim * bzdim + j * bxdim * bzdim],
+        &C[i * xdim * bydim + j * bxdim * bydim],
+        bxdim, bydim, bzdim);
+@end smallexample
+@end cartouche
+
+@noindent
+This leads to an @dfn{asynchronous invocation}, whereby @code{matmul}'s
+implementation may run in parallel with the continuation of the caller.
+
+The next section describes how memory buffers must be handled in
+StarPU-GCC code.  For a complete example, see the
+@code{gcc-plugin/examples} directory of the source distribution, and
+@ref{Vector Scaling Using the C Extension, the vector-scaling
+example}.
+
+
+@node Synchronization and Other Pragmas
+@section Initialization, Termination, and Synchronization
+
+The following pragmas allow user code to control StarPU's life time and
+to synchronize with tasks.
+
+@table @code
+
+@item #pragma starpu initialize
+Initialize StarPU.  This call is compulsory and is @emph{never} added
+implicitly.  One of the reasons this has to be done explicitly is that
+it provides greater control to user code over its resource usage.
+
+@item #pragma starpu shutdown
+Shut down StarPU, giving it an opportunity to write profiling info to a
+file on disk, for instance (@pxref{Off-line, off-line performance
+feedback}).
+
+@item #pragma starpu wait
+Wait for all task invocations to complete, as with
+@code{starpu_task_wait_for_all} (@pxref{Codelets and Tasks,
+starpu_task_wait_for_all}).
+
+@end table
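+
+Put together, these pragmas typically frame the whole program, as in the
+following minimal skeleton (task invocations would go where the comment is):
+
+@cartouche
+@smallexample
+int
+main (void)
+@{
+#pragma starpu initialize
+
+  /* ... invoke tasks here ...  */
+
+#pragma starpu wait
+#pragma starpu shutdown
+
+  return 0;
+@}
+@end smallexample
+@end cartouche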
+
+@node Registered Data Buffers
+@section Registered Data Buffers
+
+Data buffers such as matrices and vectors that are to be passed to tasks
+must be @dfn{registered}.  Registration allows StarPU to handle data
+transfers among devices---e.g., transferring an input buffer from the
+CPU's main memory to a task scheduled to run on a GPU (@pxref{StarPU Data
+Management Library}).
+
+The following pragmas are provided:
+
+@table @code
+
+@item #pragma starpu register @var{ptr} [@var{size}]
+Register @var{ptr} as a @var{size}-element buffer.  When @var{ptr} has
+an array type whose size is known, @var{size} may be omitted.
+
+@item #pragma starpu unregister @var{ptr}
+Unregister the previously-registered memory area pointed to by
+@var{ptr}.  As a side-effect, @var{ptr} points to a valid copy in main
+memory.
+
+@item #pragma starpu acquire @var{ptr}
+Acquire in main memory an up-to-date copy of the previously-registered
+memory area pointed to by @var{ptr}, for read-write access.
+
+@item #pragma starpu release @var{ptr}
+Release the previously-registered memory area pointed to by @var{ptr},
+making it available to the tasks.
+
+@end table
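+
+As an illustration, the sketch below combines these pragmas around a
+statically-sized buffer; the @code{process} task is hypothetical and assumed
+to be declared elsewhere:
+
+@cartouche
+@smallexample
+  float buffer[1024];
+
+#pragma starpu register buffer
+
+  /* Asynchronous task invocation on the registered buffer.  */
+  process (1024, buffer);
+
+#pragma starpu acquire buffer
+  /* BUFFER can now be read and modified directly from the main program.  */
+#pragma starpu release buffer
+
+#pragma starpu unregister buffer
+@end smallexample
+@end cartouche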
+
+Additionally, the @code{heap_allocated} variable attribute offers a
+simple way to allocate storage for arrays on the heap:
+
+@table @code
+
+@item heap_allocated
+@cindex @code{heap_allocated} attribute
+This attribute applies to local variables with an array type.  Its
+effect is to automatically allocate the array's storage on
+the heap, using @code{starpu_malloc} under the hood (@pxref{Basic Data
+Library API, starpu_malloc}).  The heap-allocated array is automatically
+freed when the variable's scope is left, as with
+automatic variables.
+
+@end table
+
+@noindent
+The following example illustrates use of the @code{heap_allocated}
+attribute:
+
+@example
+extern void cholesky(unsigned nblocks, unsigned size,
+                    float mat[nblocks][nblocks][size])
+  __attribute__ ((task));
+
+int
+main (int argc, char *argv[])
+@{
+#pragma starpu initialize
+
+  /* ... */
+
+  int nblocks, size;
+  parse_args (&nblocks, &size);
+
+  /* Allocate an array of the required size on the heap,
+     and register it.  */
+
+  @{
+    float matrix[nblocks][nblocks][size]
+      __attribute__ ((heap_allocated));
+
+#pragma starpu register matrix
+
+    cholesky (nblocks, size, matrix);
+
+#pragma starpu wait
+#pragma starpu unregister matrix
+
+  @}   /* MATRIX is automatically freed here.  */
+
+#pragma starpu shutdown
+
+  return EXIT_SUCCESS;
+@}
+@end example
+
+@node Conditional Extensions
+@section Using C Extensions Conditionally
+
+The C extensions described in this chapter are only available when GCC
+and its StarPU plug-in are in use.  Yet, it is possible to make use of
+these extensions when they are available---leading to hybrid CPU/GPU
+code---and discard them when they are not available---leading to valid
+sequential code.
+
+To that end, the GCC plug-in defines a C preprocessor macro when it is
+being used:
+
+@defmac STARPU_GCC_PLUGIN
+Defined for code being compiled with the StarPU GCC plug-in.  When
+defined, this macro expands to an integer denoting the version of the
+supported C extensions.
+@end defmac
+
+The code below illustrates how to define a task and its implementations
+in a way that allows it to be compiled without the GCC plug-in:
+
+@smallexample
+/* The macros below abstract over the attributes specific to
+   StarPU-GCC and the name of the CPU implementation.  */
+#ifdef STARPU_GCC_PLUGIN
+# define __task  __attribute__ ((task))
+# define CPU_TASK_IMPL(task)  task ## _cpu
+#else
+# define __task
+# define CPU_TASK_IMPL(task)  task
+#endif
+
+#include <stdlib.h>
+
+static void matmul (const float *A, const float *B, float *C,
+                    size_t nx, size_t ny, size_t nz) __task;
+
+#ifdef STARPU_GCC_PLUGIN
+
+static void matmul_cpu (const float *A, const float *B, float *C,
+                        size_t nx, size_t ny, size_t nz)
+  __attribute__ ((task_implementation ("cpu", matmul)));
+
+#endif
+
+
+static void
+CPU_TASK_IMPL (matmul) (const float *A, const float *B, float *C,
+                        size_t nx, size_t ny, size_t nz)
+@{
+  /* Code of the CPU kernel here...  */
+@}
+
+int
+main (int argc, char *argv[])
+@{
+  /* The pragmas below are simply ignored when StarPU-GCC
+     is not used.  */
+#pragma starpu initialize
+
+  float A[123][42][7], B[123][42][7], C[123][42][7];
+
+#pragma starpu register A
+#pragma starpu register B
+#pragma starpu register C
+
+  /* When StarPU-GCC is used, the call below is asynchronous;
+     otherwise, it is synchronous.  */
+  matmul (A, B, C, 123, 42, 7);
+
+#pragma starpu wait
+#pragma starpu shutdown
+
+  return EXIT_SUCCESS;
+@}
+@end smallexample
+
+Note that attributes such as @code{task} are simply ignored by GCC when
+the StarPU plug-in is not loaded, so the @code{__task} macro could be
+omitted altogether.  However, @command{gcc -Wall} emits a warning for
+unknown attributes, which can be inconvenient, and other compilers may
+be unable to parse the attribute syntax.  Thus, using macros such as
+@code{__task} above is recommended.
+
+@c Local Variables:
+@c TeX-master: "../starpu.texi"
+@c ispell-local-dictionary: "american"
+@c End:

+ 403 - 0
doc/chapters/configuration.texi

@@ -0,0 +1,403 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Compilation configuration::   
+* Execution configuration through environment variables::  
+@end menu
+
+@node Compilation configuration
+@section Compilation configuration
+
+The following arguments can be given to the @code{configure} script.
+
+@menu
+* Common configuration::        
+* Configuring workers::         
+* Advanced configuration::      
+@end menu
+
+@node Common configuration
+@subsection Common configuration
+
+@table @code
+
+@item --enable-debug
+Enable debugging messages.
+
+@item --enable-fast
+Disable assertion checks, which saves computation time.
+
+@item --enable-verbose
+Increase the verbosity of the debugging messages.  This can be disabled
+at runtime by setting the environment variable @code{STARPU_SILENT} to
+any value.
+
+@smallexample
+% STARPU_SILENT=1 ./vector_scal
+@end smallexample
+
+@item --enable-coverage
+Enable flags for the @code{gcov} coverage tool.
+
+@end table
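+
+For example, a debugging-oriented build may be configured with:
+
+@smallexample
+% ./configure --enable-debug --enable-verbose
+@end smallexample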
+
+@node Configuring workers
+@subsection Configuring workers
+
+@table @code
+
+@item --enable-maxcpus=@var{count}
+Use at most @var{count} CPU cores.  This information is then
+available as the @code{STARPU_MAXCPUS} macro.
+
+@item --disable-cpu
+Disable the use of CPUs of the machine. Only GPUs etc. will be used.
+
+@item --enable-maxcudadev=@var{count}
+Use at most @var{count} CUDA devices.  This information is then
+available as the @code{STARPU_MAXCUDADEVS} macro.
+
+@item --disable-cuda
+Disable the use of CUDA, even if a valid CUDA installation was detected.
+
+@item --with-cuda-dir=@var{prefix}
+Search for CUDA under @var{prefix}, which should notably contain
+@file{include/cuda.h}.
+
+@item --with-cuda-include-dir=@var{dir}
+Search for CUDA headers under @var{dir}, which should
+notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
+value given to @code{--with-cuda-dir}.
+
+@item --with-cuda-lib-dir=@var{dir}
+Search for CUDA libraries under @var{dir}, which should notably contain
+the CUDA shared libraries---e.g., @file{libcuda.so}.  This defaults to
+@code{/lib} appended to the value given to @code{--with-cuda-dir}.
+
+@item --disable-cuda-memcpy-peer
+Explicitly disable peer transfers when using CUDA 4.0.
+
+@item --enable-maxopencldev=@var{count}
+Use at most @var{count} OpenCL devices.  This information is then
+available as the @code{STARPU_MAXOPENCLDEVS} macro.
+
+@item --disable-opencl
+Disable the use of OpenCL, even if the SDK is detected.
+
+@item --with-opencl-dir=@var{prefix}
+Search for an OpenCL implementation under @var{prefix}, which should
+notably contain @file{include/CL/cl.h} (or @file{include/OpenCL/cl.h} on
+Mac OS).
+
+@item --with-opencl-include-dir=@var{dir}
+Search for OpenCL headers under @var{dir}, which should notably contain
+@file{CL/cl.h} (or @file{OpenCL/cl.h} on Mac OS).  This defaults to
+@code{/include} appended to the value given to @code{--with-opencl-dir}.
+
+@item --with-opencl-lib-dir=@var{dir}
+Search for an OpenCL library under @var{dir}, which should notably
+contain the OpenCL shared libraries---e.g. @file{libOpenCL.so}. This defaults to
+@code{/lib} appended to the value given to @code{--with-opencl-dir}.
+
+@item --enable-gordon
+Enable the use of the Gordon runtime for Cell SPUs.
+@c TODO: rather default to enabled when detected
+
+@item --with-gordon-dir=@var{prefix}
+Search for the Gordon SDK under @var{prefix}.
+
+@item --enable-maximplementations=@var{count}
+Allow for at most @var{count} codelet implementations for the same
+target device.  This information is then available as the
+@code{STARPU_MAXIMPLEMENTATIONS} macro.
+
+@end table
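+
+For instance, to build against a CUDA installation in a non-standard location
+while disabling OpenCL (the path below is only illustrative):
+
+@smallexample
+% ./configure --with-cuda-dir=/usr/local/cuda --disable-opencl
+@end smallexample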
+
+@node Advanced configuration
+@subsection Advanced configuration
+
+@table @code
+
+@item --enable-perf-debug
+Enable performance debugging through gprof.
+
+@item --enable-model-debug
+Enable performance model debugging.
+
+@item --enable-stats
+@c see ../../src/datawizard/datastats.c
+Enable gathering of memory transfer statistics.
+
+@item --enable-maxbuffers
+Define the maximum number of buffers that tasks will be able to take
+as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
+
+@item --enable-allocation-cache
+Enable the use of a data allocation cache to avoid the cost of repeated
+allocations with CUDA. Still experimental.
+
+@item --enable-opengl-render
+Enable the use of OpenGL for the rendering of some examples.
+@c TODO: rather default to enabled when detected
+
+@item --enable-blas-lib
+Specify the BLAS library to be used by some of the examples. The
+library has to be @code{atlas} or @code{goto}.
+
+@item --disable-starpufft
+Disable the build of libstarpufft, even if fftw or cuFFT is available.
+
+@item --with-magma=@var{prefix}
+Search for MAGMA under @var{prefix}.  @var{prefix} should notably
+contain @file{include/magmablas.h}.
+
+@item --with-fxt=@var{prefix}
+Search for FxT under @var{prefix}.
+@url{http://savannah.nongnu.org/projects/fkt, FxT} is used to generate
+traces of scheduling events, which can then be rendered using ViTE
+(@pxref{Off-line, off-line performance feedback}).  @var{prefix} should
+notably contain @code{include/fxt/fxt.h}.
+
+@item --with-perf-model-dir=@var{dir}
+Store performance models under @var{dir}, instead of the current user's
+home.
+
+@item --with-mpicc=@var{path}
+Use the @command{mpicc} compiler at @var{path}, for starpumpi
+(@pxref{StarPU MPI support}).
+
+@item --with-goto-dir=@var{prefix}
+Search for GotoBLAS under @var{prefix}.
+
+@item --with-atlas-dir=@var{prefix}
+Search for ATLAS under @var{prefix}, which should notably contain
+@file{include/cblas.h}.
+
+@item --with-mkl-cflags=@var{cflags}
+Use @var{cflags} to compile code that uses the MKL library.
+
+@item --with-mkl-ldflags=@var{ldflags}
+Use @var{ldflags} when linking code that uses the MKL library.  Note
+that the
+@url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
+MKL website} provides a script to determine the linking flags.
+
+@item --disable-gcc-extensions
+Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
+enabled when the GCC compiler provides a plug-in support.
+
+@item --disable-socl
+Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
+default, it is enabled when an OpenCL implementation is found.
+
+@item --disable-starpu-top
+Disable the StarPU-Top interface (@pxref{starpu-top}).  By default, it
+is enabled when the required dependencies are found.
+
+@end table
+@node Execution configuration through environment variables
+@section Execution configuration through environment variables
+
+@menu
+* Workers::                     Configuring workers
+* Scheduling::                  Configuring the Scheduling engine
+* Misc::                        Miscellaneous and debug
+@end menu
+
+Note: the values given in the @code{starpu_conf} structure passed when
+calling @code{starpu_init} will override the values of the environment
+variables.
+
+@node Workers
+@subsection Configuring workers
+
+@menu
+* STARPU_NCPUS::                Number of CPU workers
+* STARPU_NCUDA::                Number of CUDA workers
+* STARPU_NOPENCL::              Number of OpenCL workers
+* STARPU_NGORDON::              Number of SPU workers (Cell)
+* STARPU_WORKERS_NOBIND::       Do not bind workers
+* STARPU_WORKERS_CPUID::        Bind workers to specific CPUs
+* STARPU_WORKERS_CUDAID::       Select specific CUDA devices
+* STARPU_WORKERS_OPENCLID::     Select specific OpenCL devices
+@end menu
+
+@node STARPU_NCPUS
+@subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
+
+Specify the number of CPU workers (thus not including workers dedicated to control accelerators). Note that by default, StarPU will not allocate
+more CPU workers than there are physical CPUs, and that some CPUs are used to control
+the accelerators.
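+
+For example, to restrict a run to two CPU workers (using the
+@code{vector_scal} example as an illustrative program):
+
+@smallexample
+% STARPU_NCPUS=2 ./vector_scal
+@end smallexample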
+
+@node STARPU_NCUDA
+@subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
+
+Specify the number of CUDA devices that StarPU can use. If
+@code{STARPU_NCUDA} is lower than the number of physical devices, it is
+possible to select which CUDA devices should be used by the means of the
+@code{STARPU_WORKERS_CUDAID} environment variable. By default, StarPU will
+create as many CUDA workers as there are CUDA devices.
+
+@node STARPU_NOPENCL
+@subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
+
+OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
+
+@node STARPU_NGORDON
+@subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
+
+Specify the number of SPUs that StarPU can use.
+
+@node STARPU_WORKERS_NOBIND
+@subsubsection @code{STARPU_WORKERS_NOBIND} -- Do not bind workers to specific CPUs
+
+Setting it to non-zero will prevent StarPU from binding its threads to
+CPUs. This is for instance useful when running the testsuite in parallel.
+
+@node STARPU_WORKERS_CPUID
+@subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
+
+Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
+specifies on which logical CPU the different workers should be
+bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
+worker will be bound to logical CPU #0, the second CPU worker will be bound to
+logical CPU #1 and so on.  Note that the logical ordering of the CPUs is either
+determined by the OS, or provided by the @code{hwloc} library in case it is
+available.
+
+Note that the first workers correspond to the CUDA workers, then come the
+OpenCL and the SPU, and finally the CPU workers. For example if
+we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
+and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
+by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
+the logical CPUs #1 and #3 will be used by the CPU workers.
+
+If the number of workers is larger than the array given in
+@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
+round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
+third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
+
+This variable is ignored if the @code{use_explicit_workers_bindid} flag of the
+@code{starpu_conf} structure passed to @code{starpu_init} is set.
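+
+The binding described above can for instance be requested on the command line
+as follows (again with @code{vector_scal} as an illustrative program):
+
+@smallexample
+% STARPU_NCUDA=1 STARPU_NOPENCL=1 STARPU_NCPUS=2 \
+  STARPU_WORKERS_CPUID="0 2 1 3" ./vector_scal
+@end smallexample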
+
+@node STARPU_WORKERS_CUDAID
+@subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
+
+Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
+possible to select which CUDA devices should be used by StarPU. On a machine
+equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
+@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
+they should use CUDA devices #1 and #3 (the logical ordering of the devices is
+the one reported by CUDA).
+
+This variable is ignored if the @code{use_explicit_workers_cuda_gpuid} flag of
+the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+
+@node STARPU_WORKERS_OPENCLID
+@subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
+
+OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
+
+This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
+the @code{starpu_conf} structure passed to @code{starpu_init} is set.
+
+@node Scheduling
+@subsection Configuring the Scheduling engine
+
+@menu
+* STARPU_SCHED::                Scheduling policy
+* STARPU_CALIBRATE::            Calibrate performance models
+* STARPU_PREFETCH::             Use data prefetch
+* STARPU_SCHED_ALPHA::          Computation factor
+* STARPU_SCHED_BETA::           Communication factor
+@end menu
+
+@node STARPU_SCHED
+@subsubsection @code{STARPU_SCHED} -- Scheduling policy
+
+Choose between the different scheduling policies proposed by StarPU: random,
+work stealing, greedy, with performance models, etc.
+
+Use @code{STARPU_SCHED=help} to get the list of available schedulers.
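+
+For instance, to select the @code{dmda} policy for a single run:
+
+@smallexample
+% STARPU_SCHED=dmda ./vector_scal
+@end smallexample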
+
+@node STARPU_CALIBRATE
+@subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
+
+If this variable is set to 1, the performance models are calibrated during
+the execution. If it is set to 2, the previous values are dropped to restart
+calibration from scratch. Setting this variable to 0 disables calibration,
+which is the default behaviour.
+
+Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
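+
+For example, calibration can be enabled together with one of these policies:
+
+@smallexample
+% STARPU_CALIBRATE=1 STARPU_SCHED=dmda ./vector_scal
+@end smallexample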
+
+@node STARPU_PREFETCH
+@subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
+
+This variable indicates whether data prefetching should be enabled (0 means
+that it is disabled). If prefetching is enabled, when a task is scheduled to be
+executed e.g. on a GPU, StarPU will request an asynchronous transfer in
+advance, so that data is already present on the GPU when the task starts. As a
+result, computation and data transfers are overlapped.
+Note that prefetching is enabled by default in StarPU.
+
+@node STARPU_SCHED_ALPHA
+@subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
+
+To estimate the cost of a task StarPU takes into account the estimated
+computation time (obtained thanks to performance models). The alpha factor is
+the coefficient to be applied to it before adding it to the communication part.
+
+@node STARPU_SCHED_BETA
+@subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
+
+To estimate the cost of a task StarPU takes into account the estimated
+data transfer time (obtained thanks to performance models). The beta factor is
+the coefficient to be applied to it before adding it to the computation part.
+
+@node Misc
+@subsection Miscellaneous and debug
+
+@menu
+* STARPU_SILENT::               Disable verbose mode
+* STARPU_LOGFILENAME::          Select debug file name
+* STARPU_FXT_PREFIX::           FxT trace location
+* STARPU_LIMIT_GPU_MEM::        Restrict memory size on the GPUs
+* STARPU_GENERATE_TRACE::       Generate a Paje trace when StarPU is shut down
+@end menu
+
+@node STARPU_SILENT
+@subsubsection @code{STARPU_SILENT} -- Disable verbose mode
+
+This variable allows verbose mode to be disabled at runtime when StarPU
+has been configured with the option @code{--enable-verbose}.
+
+@node STARPU_LOGFILENAME
+@subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
+
+This variable specifies the file in which the debugging output should be saved.
+
+@node STARPU_FXT_PREFIX
+@subsubsection @code{STARPU_FXT_PREFIX} -- FxT trace location
+
+This variable specifies in which directory to save the trace generated if FxT is enabled. It needs to have a trailing '/' character.
+
+@node STARPU_LIMIT_GPU_MEM
+@subsubsection @code{STARPU_LIMIT_GPU_MEM} -- Restrict memory size on the GPUs
+
+This variable specifies the maximum number of megabytes that should be
+available to the application on each GPU. In case this value is smaller than
+the size of the memory of a GPU, StarPU pre-allocates a buffer to waste memory
+on the device. This variable is intended to be used for experimental purposes
+as it emulates devices that have a limited amount of memory.
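+
+For example, to emulate GPUs on which only 1024 MB are available to the
+application:
+
+@smallexample
+% STARPU_LIMIT_GPU_MEM=1024 ./vector_scal
+@end smallexample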
+
+@node STARPU_GENERATE_TRACE
+@subsubsection @code{STARPU_GENERATE_TRACE} -- Generate a Paje trace when StarPU is shut down
+
+When set to 1, this variable indicates that StarPU should automatically
+generate a Paje trace when @code{starpu_shutdown} is called.
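+
+For example (assuming StarPU was built with FxT support):
+
+@smallexample
+% STARPU_GENERATE_TRACE=1 ./vector_scal
+@end smallexample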

+ 508 - 0
doc/chapters/fdl-1.3.texi

@@ -0,0 +1,508 @@
+@c -*-texinfo-*-
+
+@c The GNU Free Documentation License.
+@center Version 1.3, 3 November 2008
+
+@c This file is intended to be included within another document,
+@c hence no sectioning command or @node.
+
+@display
+Copyright @copyright{} 2000, 2001, 2002, 2007, 2008 Free Software Foundation, Inc.
+@uref{http://fsf.org/}
+
+Everyone is permitted to copy and distribute verbatim copies
+of this license document, but changing it is not allowed.
+@end display
+
+@enumerate 0
+@item
+PREAMBLE
+
+The purpose of this License is to make a manual, textbook, or other
+functional and useful document @dfn{free} in the sense of freedom: to
+assure everyone the effective freedom to copy and redistribute it,
+with or without modifying it, either commercially or noncommercially.
+Secondarily, this License preserves for the author and publisher a way
+to get credit for their work, while not being considered responsible
+for modifications made by others.
+
+This License is a kind of ``copyleft'', which means that derivative
+works of the document must themselves be free in the same sense.  It
+complements the GNU General Public License, which is a copyleft
+license designed for free software.
+
+We have designed this License in order to use it for manuals for free
+software, because free software needs free documentation: a free
+program should come with manuals providing the same freedoms that the
+software does.  But this License is not limited to software manuals;
+it can be used for any textual work, regardless of subject matter or
+whether it is published as a printed book.  We recommend this License
+principally for works whose purpose is instruction or reference.
+
+@item
+APPLICABILITY AND DEFINITIONS
+
+This License applies to any manual or other work, in any medium, that
+contains a notice placed by the copyright holder saying it can be
+distributed under the terms of this License.  Such a notice grants a
+world-wide, royalty-free license, unlimited in duration, to use that
+work under the conditions stated herein.  The ``Document'', below,
+refers to any such manual or work.  Any member of the public is a
+licensee, and is addressed as ``you''.  You accept the license if you
+copy, modify or distribute the work in a way requiring permission
+under copyright law.
+
+A ``Modified Version'' of the Document means any work containing the
+Document or a portion of it, either copied verbatim, or with
+modifications and/or translated into another language.
+
+A ``Secondary Section'' is a named appendix or a front-matter section
+of the Document that deals exclusively with the relationship of the
+publishers or authors of the Document to the Document's overall
+subject (or to related matters) and contains nothing that could fall
+directly within that overall subject.  (Thus, if the Document is in
+part a textbook of mathematics, a Secondary Section may not explain
+any mathematics.)  The relationship could be a matter of historical
+connection with the subject or with related matters, or of legal,
+commercial, philosophical, ethical or political position regarding
+them.
+
+The ``Invariant Sections'' are certain Secondary Sections whose titles
+are designated, as being those of Invariant Sections, in the notice
+that says that the Document is released under this License.  If a
+section does not fit the above definition of Secondary then it is not
+allowed to be designated as Invariant.  The Document may contain zero
+Invariant Sections.  If the Document does not identify any Invariant
+Sections then there are none.
+
+The ``Cover Texts'' are certain short passages of text that are listed,
+as Front-Cover Texts or Back-Cover Texts, in the notice that says that
+the Document is released under this License.  A Front-Cover Text may
+be at most 5 words, and a Back-Cover Text may be at most 25 words.
+
+A ``Transparent'' copy of the Document means a machine-readable copy,
+represented in a format whose specification is available to the
+general public, that is suitable for revising the document
+straightforwardly with generic text editors or (for images composed of
+pixels) generic paint programs or (for drawings) some widely available
+drawing editor, and that is suitable for input to text formatters or
+for automatic translation to a variety of formats suitable for input
+to text formatters.  A copy made in an otherwise Transparent file
+format whose markup, or absence of markup, has been arranged to thwart
+or discourage subsequent modification by readers is not Transparent.
+An image format is not Transparent if used for any substantial amount
+of text.  A copy that is not ``Transparent'' is called ``Opaque''.
+
+Examples of suitable formats for Transparent copies include plain
+ASCII without markup, Texinfo input format, La@TeX{} input
+format, SGML or XML using a publicly available
+DTD, and standard-conforming simple HTML,
+PostScript or PDF designed for human modification.  Examples
+of transparent image formats include PNG, XCF and
+JPG.  Opaque formats include proprietary formats that can be
+read and edited only by proprietary word processors, SGML or
+XML for which the DTD and/or processing tools are
+not generally available, and the machine-generated HTML,
+PostScript or PDF produced by some word processors for
+output purposes only.
+
+The ``Title Page'' means, for a printed book, the title page itself,
+plus such following pages as are needed to hold, legibly, the material
+this License requires to appear in the title page.  For works in
+formats which do not have any title page as such, ``Title Page'' means
+the text near the most prominent appearance of the work's title,
+preceding the beginning of the body of the text.
+
+The ``publisher'' means any person or entity that distributes copies
+of the Document to the public.
+
+A section ``Entitled XYZ'' means a named subunit of the Document whose
+title either is precisely XYZ or contains XYZ in parentheses following
+text that translates XYZ in another language.  (Here XYZ stands for a
+specific section name mentioned below, such as ``Acknowledgements'',
+``Dedications'', ``Endorsements'', or ``History''.)  To ``Preserve the Title''
+of such a section when you modify the Document means that it remains a
+section ``Entitled XYZ'' according to this definition.
+
+The Document may include Warranty Disclaimers next to the notice which
+states that this License applies to the Document.  These Warranty
+Disclaimers are considered to be included by reference in this
+License, but only as regards disclaiming warranties: any other
+implication that these Warranty Disclaimers may have is void and has
+no effect on the meaning of this License.
+
+@item
+VERBATIM COPYING
+
+You may copy and distribute the Document in any medium, either
+commercially or noncommercially, provided that this License, the
+copyright notices, and the license notice saying this License applies
+to the Document are reproduced in all copies, and that you add no other
+conditions whatsoever to those of this License.  You may not use
+technical measures to obstruct or control the reading or further
+copying of the copies you make or distribute.  However, you may accept
+compensation in exchange for copies.  If you distribute a large enough
+number of copies you must also follow the conditions in section 3.
+
+You may also lend copies, under the same conditions stated above, and
+you may publicly display copies.
+
+@item
+COPYING IN QUANTITY
+
+If you publish printed copies (or copies in media that commonly have
+printed covers) of the Document, numbering more than 100, and the
+Document's license notice requires Cover Texts, you must enclose the
+copies in covers that carry, clearly and legibly, all these Cover
+Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on
+the back cover.  Both covers must also clearly and legibly identify
+you as the publisher of these copies.  The front cover must present
+the full title with all words of the title equally prominent and
+visible.  You may add other material on the covers in addition.
+Copying with changes limited to the covers, as long as they preserve
+the title of the Document and satisfy these conditions, can be treated
+as verbatim copying in other respects.
+
+If the required texts for either cover are too voluminous to fit
+legibly, you should put the first ones listed (as many as fit
+reasonably) on the actual cover, and continue the rest onto adjacent
+pages.
+
+If you publish or distribute Opaque copies of the Document numbering
+more than 100, you must either include a machine-readable Transparent
+copy along with each Opaque copy, or state in or with each Opaque copy
+a computer-network location from which the general network-using
+public has access to download using public-standard network protocols
+a complete Transparent copy of the Document, free of added material.
+If you use the latter option, you must take reasonably prudent steps,
+when you begin distribution of Opaque copies in quantity, to ensure
+that this Transparent copy will remain thus accessible at the stated
+location until at least one year after the last time you distribute an
+Opaque copy (directly or through your agents or retailers) of that
+edition to the public.
+
+It is requested, but not required, that you contact the authors of the
+Document well before redistributing any large number of copies, to give
+them a chance to provide you with an updated version of the Document.
+
+@item
+MODIFICATIONS
+
+You may copy and distribute a Modified Version of the Document under
+the conditions of sections 2 and 3 above, provided that you release
+the Modified Version under precisely this License, with the Modified
+Version filling the role of the Document, thus licensing distribution
+and modification of the Modified Version to whoever possesses a copy
+of it.  In addition, you must do these things in the Modified Version:
+
+@enumerate A
+@item
+Use in the Title Page (and on the covers, if any) a title distinct
+from that of the Document, and from those of previous versions
+(which should, if there were any, be listed in the History section
+of the Document).  You may use the same title as a previous version
+if the original publisher of that version gives permission.
+
+@item
+List on the Title Page, as authors, one or more persons or entities
+responsible for authorship of the modifications in the Modified
+Version, together with at least five of the principal authors of the
+Document (all of its principal authors, if it has fewer than five),
+unless they release you from this requirement.
+
+@item
+State on the Title page the name of the publisher of the
+Modified Version, as the publisher.
+
+@item
+Preserve all the copyright notices of the Document.
+
+@item
+Add an appropriate copyright notice for your modifications
+adjacent to the other copyright notices.
+
+@item
+Include, immediately after the copyright notices, a license notice
+giving the public permission to use the Modified Version under the
+terms of this License, in the form shown in the Addendum below.
+
+@item
+Preserve in that license notice the full lists of Invariant Sections
+and required Cover Texts given in the Document's license notice.
+
+@item
+Include an unaltered copy of this License.
+
+@item
+Preserve the section Entitled ``History'', Preserve its Title, and add
+to it an item stating at least the title, year, new authors, and
+publisher of the Modified Version as given on the Title Page.  If
+there is no section Entitled ``History'' in the Document, create one
+stating the title, year, authors, and publisher of the Document as
+given on its Title Page, then add an item describing the Modified
+Version as stated in the previous sentence.
+
+@item
+Preserve the network location, if any, given in the Document for
+public access to a Transparent copy of the Document, and likewise
+the network locations given in the Document for previous versions
+it was based on.  These may be placed in the ``History'' section.
+You may omit a network location for a work that was published at
+least four years before the Document itself, or if the original
+publisher of the version it refers to gives permission.
+
+@item
+For any section Entitled ``Acknowledgements'' or ``Dedications'', Preserve
+the Title of the section, and preserve in the section all the
+substance and tone of each of the contributor acknowledgements and/or
+dedications given therein.
+
+@item
+Preserve all the Invariant Sections of the Document,
+unaltered in their text and in their titles.  Section numbers
+or the equivalent are not considered part of the section titles.
+
+@item
+Delete any section Entitled ``Endorsements''.  Such a section
+may not be included in the Modified Version.
+
+@item
+Do not retitle any existing section to be Entitled ``Endorsements'' or
+to conflict in title with any Invariant Section.
+
+@item
+Preserve any Warranty Disclaimers.
+@end enumerate
+
+If the Modified Version includes new front-matter sections or
+appendices that qualify as Secondary Sections and contain no material
+copied from the Document, you may at your option designate some or all
+of these sections as invariant.  To do this, add their titles to the
+list of Invariant Sections in the Modified Version's license notice.
+These titles must be distinct from any other section titles.
+
+You may add a section Entitled ``Endorsements'', provided it contains
+nothing but endorsements of your Modified Version by various
+parties---for example, statements of peer review or that the text has
+been approved by an organization as the authoritative definition of a
+standard.
+
+You may add a passage of up to five words as a Front-Cover Text, and a
+passage of up to 25 words as a Back-Cover Text, to the end of the list
+of Cover Texts in the Modified Version.  Only one passage of
+Front-Cover Text and one of Back-Cover Text may be added by (or
+through arrangements made by) any one entity.  If the Document already
+includes a cover text for the same cover, previously added by you or
+by arrangement made by the same entity you are acting on behalf of,
+you may not add another; but you may replace the old one, on explicit
+permission from the previous publisher that added the old one.
+
+The author(s) and publisher(s) of the Document do not by this License
+give permission to use their names for publicity for or to assert or
+imply endorsement of any Modified Version.
+
+@item
+COMBINING DOCUMENTS
+
+You may combine the Document with other documents released under this
+License, under the terms defined in section 4 above for modified
+versions, provided that you include in the combination all of the
+Invariant Sections of all of the original documents, unmodified, and
+list them all as Invariant Sections of your combined work in its
+license notice, and that you preserve all their Warranty Disclaimers.
+
+The combined work need only contain one copy of this License, and
+multiple identical Invariant Sections may be replaced with a single
+copy.  If there are multiple Invariant Sections with the same name but
+different contents, make the title of each such section unique by
+adding at the end of it, in parentheses, the name of the original
+author or publisher of that section if known, or else a unique number.
+Make the same adjustment to the section titles in the list of
+Invariant Sections in the license notice of the combined work.
+
+In the combination, you must combine any sections Entitled ``History''
+in the various original documents, forming one section Entitled
+``History''; likewise combine any sections Entitled ``Acknowledgements'',
+and any sections Entitled ``Dedications''.  You must delete all
+sections Entitled ``Endorsements.''
+
+@item
+COLLECTIONS OF DOCUMENTS
+
+You may make a collection consisting of the Document and other documents
+released under this License, and replace the individual copies of this
+License in the various documents with a single copy that is included in
+the collection, provided that you follow the rules of this License for
+verbatim copying of each of the documents in all other respects.
+
+You may extract a single document from such a collection, and distribute
+it individually under this License, provided you insert a copy of this
+License into the extracted document, and follow this License in all
+other respects regarding verbatim copying of that document.
+
+@item
+AGGREGATION WITH INDEPENDENT WORKS
+
+A compilation of the Document or its derivatives with other separate
+and independent documents or works, in or on a volume of a storage or
+distribution medium, is called an ``aggregate'' if the copyright
+resulting from the compilation is not used to limit the legal rights
+of the compilation's users beyond what the individual works permit.
+When the Document is included in an aggregate, this License does not
+apply to the other works in the aggregate which are not themselves
+derivative works of the Document.
+
+If the Cover Text requirement of section 3 is applicable to these
+copies of the Document, then if the Document is less than one half of
+the entire aggregate, the Document's Cover Texts may be placed on
+covers that bracket the Document within the aggregate, or the
+electronic equivalent of covers if the Document is in electronic form.
+Otherwise they must appear on printed covers that bracket the whole
+aggregate.
+
+@item
+TRANSLATION
+
+Translation is considered a kind of modification, so you may
+distribute translations of the Document under the terms of section 4.
+Replacing Invariant Sections with translations requires special
+permission from their copyright holders, but you may include
+translations of some or all Invariant Sections in addition to the
+original versions of these Invariant Sections.  You may include a
+translation of this License, and all the license notices in the
+Document, and any Warranty Disclaimers, provided that you also include
+the original English version of this License and the original versions
+of those notices and disclaimers.  In case of a disagreement between
+the translation and the original version of this License or a notice
+or disclaimer, the original version will prevail.
+
+If a section in the Document is Entitled ``Acknowledgements'',
+``Dedications'', or ``History'', the requirement (section 4) to Preserve
+its Title (section 1) will typically require changing the actual
+title.
+
+@item
+TERMINATION
+
+You may not copy, modify, sublicense, or distribute the Document
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense, or distribute it is void, and
+will automatically terminate your rights under this License.
+
+However, if you cease all violation of this License, then your license
+from a particular copyright holder is reinstated (a) provisionally,
+unless and until the copyright holder explicitly and finally
+terminates your license, and (b) permanently, if the copyright holder
+fails to notify you of the violation by some reasonable means prior to
+60 days after the cessation.
+
+Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, receipt of a copy of some or all of the same material does
+not give you any rights to use it.
+
+@item
+FUTURE REVISIONS OF THIS LICENSE
+
+The Free Software Foundation may publish new, revised versions
+of the GNU Free Documentation License from time to time.  Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.  See
+@uref{http://www.gnu.org/copyleft/}.
+
+Each version of the License is given a distinguishing version number.
+If the Document specifies that a particular numbered version of this
+License ``or any later version'' applies to it, you have the option of
+following the terms and conditions either of that specified version or
+of any later version that has been published (not as a draft) by the
+Free Software Foundation.  If the Document does not specify a version
+number of this License, you may choose any version ever published (not
+as a draft) by the Free Software Foundation.  If the Document
+specifies that a proxy can decide which future versions of this
+License can be used, that proxy's public statement of acceptance of a
+version permanently authorizes you to choose that version for the
+Document.
+
+@item
+RELICENSING
+
+``Massive Multiauthor Collaboration Site'' (or ``MMC Site'') means any
+World Wide Web server that publishes copyrightable works and also
+provides prominent facilities for anybody to edit those works.  A
+public wiki that anybody can edit is an example of such a server.  A
+``Massive Multiauthor Collaboration'' (or ``MMC'') contained in the
+site means any set of copyrightable works thus published on the MMC
+site.
+
+``CC-BY-SA'' means the Creative Commons Attribution-Share Alike 3.0
+license published by Creative Commons Corporation, a not-for-profit
+corporation with a principal place of business in San Francisco,
+California, as well as future copyleft versions of that license
+published by that same organization.
+
+``Incorporate'' means to publish or republish a Document, in whole or
+in part, as part of another Document.
+
+An MMC is ``eligible for relicensing'' if it is licensed under this
+License, and if all works that were first published under this License
+somewhere other than this MMC, and subsequently incorporated in whole
+or in part into the MMC, (1) had no cover texts or invariant sections,
+and (2) were thus incorporated prior to November 1, 2008.
+
+The operator of an MMC Site may republish an MMC contained in the site
+under CC-BY-SA on the same site at any time before August 1, 2009,
+provided the MMC is eligible for relicensing.
+
+@end enumerate
+
+@page
+@heading ADDENDUM: How to use this License for your documents
+
+To use this License in a document you have written, include a copy of
+the License in the document and put the following copyright and
+license notices just after the title page:
+
+@smallexample
+@group
+  Copyright (C)  @var{year}  @var{your name}.
+  Permission is granted to copy, distribute and/or modify this document
+  under the terms of the GNU Free Documentation License, Version 1.3
+  or any later version published by the Free Software Foundation;
+  with no Invariant Sections, no Front-Cover Texts, and no Back-Cover
+  Texts.  A copy of the license is included in the section entitled ``GNU
+  Free Documentation License''.
+@end group
+@end smallexample
+
+If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts,
+replace the ``with@dots{}Texts.'' line with this:
+
+@smallexample
+@group
+    with the Invariant Sections being @var{list their titles}, with
+    the Front-Cover Texts being @var{list}, and with the Back-Cover Texts
+    being @var{list}.
+@end group
+@end smallexample
+
+If you have Invariant Sections without Cover Texts, or some other
+combination of the three, merge those two alternatives to suit the
+situation.
+
+If your document contains nontrivial examples of program code, we
+recommend releasing these examples in parallel under your choice of
+free software license, such as the GNU General Public License,
+to permit their use in free software.
+
+@c Local Variables:
+@c ispell-local-pdict: "ispell-dict"
+@c End:
+

+ 107 - 0
doc/chapters/fft-support.texi

@@ -0,0 +1,107 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+StarPU provides @code{libstarpufft}, a library whose design is very similar to
+both fftw and cufft, the difference being that it takes advantage of both CPUs
+and GPUs. It should however be noted that GPUs do not have the same precision as
+CPUs, so the results may differ by a negligible amount.
+
+Float, double and long double precisions are available, following the fftw naming
+convention:
+
+@enumerate
+@item double precision structures and functions are prefixed with @code{starpufft_}, e.g. @code{starpufft_execute}
+@item float precision structures and functions are prefixed with @code{starpufftf_}, e.g. @code{starpufftf_execute}
+@item long double precision structures and functions are prefixed with @code{starpufftl_}, e.g. @code{starpufftl_execute}
+@end enumerate
+
+The documentation below uses the double-precision names; replace
+@code{starpufft_} with @code{starpufftf_} or @code{starpufftl_} as appropriate.
+
+Only complex numbers are supported at the moment.
+
+The application has to call @code{starpu_init} before calling starpufft functions.
+
+Either main memory pointers or data handles can be provided.
+
+@enumerate
+@item To provide main memory pointers, use @code{starpufft_start} or
+@code{starpufft_execute}. Only one FFT can be performed at a time, because
+StarPU will have to register the data on the fly. In the @code{starpufft_start}
+case, @code{starpufft_cleanup} needs to be called to unregister the data.
+@item To provide data handles (which is preferable),
+use @code{starpufft_start_handle} (preferred) or
+@code{starpufft_execute_handle}. Several FFT tasks can be submitted
+for a given plan, which permits e.g. starting a series of FFTs with just one
+plan. @code{starpufft_start_handle} is preferable since it does not wait for
+task completion, and thus permits enqueueing a series of tasks.
+@end enumerate
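+
+As an illustration, here is a minimal sketch performing a single 1D transform
+on main-memory buffers with the functions documented below; the
+@code{starpufft.h} header name and the absence of error checking are
+assumptions of this sketch:
+
+@cartouche
+@smallexample
+#include <complex.h>
+#include <starpu.h>
+#include <starpufft.h>
+
+int main(void)
+@{
+    int n = 1024;
+    starpu_init(NULL);
+
+    /* Pinned buffers allow overlapped transfers */
+    double _Complex *in  = starpufft_malloc(n * sizeof(*in));
+    double _Complex *out = starpufft_malloc(n * sizeof(*out));
+    /* ... fill 'in' ... */
+
+    starpufft_plan plan = starpufft_plan_dft_1d(n, STARPUFFT_FORWARD, 0);
+    starpufft_execute(plan, in, out);   /* submits the task and waits for it */
+
+    starpufft_destroy_plan(plan);
+    starpufft_free(in);
+    starpufft_free(out);
+    starpu_shutdown();
+    return 0;
+@}
+@end smallexample
+@end cartouche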
+
+@section Compilation
+
+The flags required to compile or link against the FFT library are accessible
+with the following commands:
+
+@example
+% pkg-config --cflags starpufft-1.0  # options for the compiler
+% pkg-config --libs starpufft-1.0    # options for the linker
+@end example
+
+Also pass the @code{--static} option if the application is to be linked statically.
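+
+For instance, a program contained in a single source file @code{myfft.c} (a
+hypothetical name) could be built with something like:
+
+@example
+% gcc myfft.c -o myfft $(pkg-config --cflags starpufft-1.0) \
+              $(pkg-config --libs starpufft-1.0)
+@end example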
+
+@section Initialisation
+
+@deftypefun {void *} starpufft_malloc (size_t @var{n})
+Allocates memory for @var{n} bytes. This is preferred over @code{malloc}, since
+it allocates pinned memory, which allows overlapped transfers.
+@end deftypefun
+
+@deftypefun void starpufft_free (void *@var{p})
+Releases memory previously allocated with @code{starpufft_malloc}.
+@end deftypefun
+
+@deftypefun {struct starpufft_plan *} starpufft_plan_dft_1d (int @var{n}, int @var{sign}, unsigned @var{flags})
+Initializes a plan for 1D FFT of size @var{n}. @var{sign} can be
+@code{STARPUFFT_FORWARD} or @code{STARPUFFT_INVERSE}. @var{flags} must be 0.
+@end deftypefun
+
+@deftypefun {struct starpufft_plan *} starpufft_plan_dft_2d (int @var{n}, int @var{m}, int @var{sign}, unsigned @var{flags})
+Initializes a plan for 2D FFT of size (@var{n}, @var{m}). @var{sign} can be
+@code{STARPUFFT_FORWARD} or @code{STARPUFFT_INVERSE}. @var{flags} must be 0.
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpufft_start (starpufft_plan @var{p}, void *@var{in}, void *@var{out})
+Start an FFT previously planned as @var{p}, using @var{in} and @var{out} as
+input and output. This only submits the task and does not wait for it.
+The application should call @code{starpufft_cleanup} to unregister the data.
+@end deftypefun
+
+@deftypefun {struct starpu_task *} starpufft_start_handle (starpufft_plan @var{p}, starpu_data_handle_t @var{in}, starpu_data_handle_t @var{out})
+Start an FFT previously planned as @var{p}, using data handles @var{in} and
+@var{out} as input and output (assumed to be vectors of elements of the expected
+types). This only submits the task and does not wait for it.
+@end deftypefun
+
+@deftypefun void starpufft_execute (starpufft_plan @var{p}, void *@var{in}, void *@var{out})
+Execute an FFT previously planned as @var{p}, using @var{in} and @var{out} as
+input and output. This submits and waits for the task.
+@end deftypefun
+
+@deftypefun void starpufft_execute_handle (starpufft_plan @var{p}, starpu_data_handle_t @var{in}, starpu_data_handle_t @var{out})
+Execute an FFT previously planned as @var{p}, using data handles @var{in} and
+@var{out} as input and output (assumed to be vectors of elements of the expected
+types). This submits and waits for the task.
+@end deftypefun
+
+@deftypefun void starpufft_cleanup (starpufft_plan @var{p})
+Releases data for plan @var{p}, in the @code{starpufft_start} case.
+@end deftypefun
+
+@deftypefun void starpufft_destroy_plan (starpufft_plan @var{p})
+Destroys plan @var{p}, i.e. releases all CPU (fftw) and GPU (cufft) resources.
+@end deftypefun

+ 130 - 0
doc/chapters/installing.texi

@@ -0,0 +1,130 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Downloading StarPU::          
+* Configuration of StarPU::     
+* Building and Installing StarPU::  
+@end menu
+
+StarPU can be built and installed using the standard GNU
+autotools. The following chapter briefly recalls how these tools
+can be used to install StarPU.
+
+@node Downloading StarPU
+@section Downloading StarPU
+
+@menu
+* Getting Sources::             
+* Optional dependencies::       
+@end menu
+
+@node Getting Sources
+@subsection Getting Sources
+
+The latest official release tarballs of StarPU sources are available
+for download from
+@indicateurl{https://gforge.inria.fr/frs/?group_id=1570}.
+
+The latest nightly development snapshot is available from
+@indicateurl{http://starpu.gforge.inria.fr/testing/}.
+
+@example
+% wget http://starpu.gforge.inria.fr/testing/starpu-nightly-latest.tar.gz
+@end example
+
+Additionally, the code can be checked out directly from Subversion; this
+should be done only if you need the very latest changes (i.e. less
+than a day old).@footnote{The Subversion client can
+be obtained from @indicateurl{http://subversion.tigris.org}. If you
+are running on Windows, you will probably prefer to use TortoiseSVN
+from @indicateurl{http://tortoisesvn.tigris.org/}}.
+
+@example
+% svn checkout svn://scm.gforge.inria.fr/svn/starpu/trunk
+@end example
+
+@node Optional dependencies
+@subsection Optional dependencies
+
+The topology discovery library, @code{hwloc}, is not mandatory to use StarPU
+but is strongly recommended. It increases performance and enables
+topology-aware scheduling.
+
+@code{hwloc} is available in major distributions and for most OSes and can be
+downloaded from @indicateurl{http://www.open-mpi.org/software/hwloc}.
+
+@node Configuration of StarPU
+@section Configuration of StarPU
+
+@menu
+* Generating Makefiles and configuration scripts::  
+* Running the configuration::   
+@end menu
+
+@node Generating Makefiles and configuration scripts
+@subsection Generating Makefiles and configuration scripts
+
+This step is not necessary when using the tarball releases of StarPU.  If you
+are using the source code from the svn repository, you first need to generate
+the configure scripts and the Makefiles. This requires the
+availability of @code{autoconf} (version 2.60 or later), @code{automake}, and @code{makeinfo}.
+
+@example
+% ./autogen.sh
+@end example
+
+@node Running the configuration
+@subsection Running the configuration
+
+@example
+% ./configure
+@end example
+
+Details about the options that can usefully be passed to @code{./configure}
+are given in @ref{Compilation configuration}.
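+
+For instance, to install StarPU under a custom prefix (the path below is only
+an example):
+
+@example
+% ./configure --prefix=$HOME/starpu
+@end example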
+
+@node Building and Installing StarPU
+@section Building and Installing StarPU
+
+@menu
+* Building::                    
+* Sanity Checks::               
+* Installing::                  
+@end menu
+
+@node Building
+@subsection Building
+
+@example
+% make
+@end example
+
+@node Sanity Checks
+@subsection Sanity Checks
+
+In order to make sure that StarPU is working properly on the system, it is also
+possible to run a test suite.
+
+@example
+% make check
+@end example
+
+@node Installing
+@subsection Installing
+
+In order to install StarPU at the location that was specified during
+configuration:
+
+@example
+% make install
+@end example
+
+Libtool interface versioning information is included in
+the library names (libstarpu-1.0.so, libstarpumpi-1.0.so and
+libstarpufft-1.0.so).

+ 186 - 0
doc/chapters/introduction.texi

@@ -0,0 +1,186 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Motivation::                  Why StarPU ?
+* StarPU in a Nutshell::        The Fundamentals of StarPU
+@end menu
+
+@node Motivation
+@section Motivation
+
+@c complex machines with heterogeneous cores/devices
+The use of specialized hardware such as accelerators or coprocessors offers an
+interesting approach to overcoming the physical limits encountered by processor
+architects. As a result, many machines are now equipped with one or several
+accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
+effort has been devoted to offloading computation onto such accelerators, very
+little attention has been paid to portability concerns on the one hand, and to the
+possibility of having heterogeneous accelerators and processors interact on the other hand.
+
+StarPU is a runtime system that offers support for heterogeneous multicore
+architectures. It not only offers a unified view of the computational resources
+(i.e. CPUs and accelerators at the same time), but also takes care of
+efficiently mapping and executing tasks onto a heterogeneous machine while
+transparently handling low-level issues such as data transfers in a portable
+fashion.
+
+@c this leads to a complicated distributed memory design
+@c which is not (easily) manageable by hand
+
+@c added value/benefits of StarPU
+@c   - portability
+@c   - scheduling, perf. portability
+
+@node StarPU in a Nutshell
+@section StarPU in a Nutshell
+
+StarPU is a software tool aiming to allow programmers to exploit the
+computing power of the available CPUs and GPUs, while relieving them
+from the need to specially adapt their programs to the target machine
+and processing units.
+
+At the core of StarPU is its run-time support library, which is
+responsible for scheduling application-provided tasks on heterogeneous
+CPU/GPU machines.  In addition, StarPU comes with programming language
+support, in the form of extensions to languages of the C family
+(@pxref{C Extensions}), as well as an OpenCL front-end (@pxref{SOCL
+OpenCL Extensions}).
+
+@cindex task-based programming model
+StarPU's run-time and programming language extensions support a
+@dfn{task-based programming model}.  Applications submit computational
+tasks, with CPU and/or GPU implementations, and StarPU schedules these
+tasks and associated data transfers on available CPUs and GPUs.  The
+data that a task manipulates are automatically transferred among
+accelerators and the main memory, so that programmers are freed from the
+scheduling issues and technical details associated with these transfers.
+
+StarPU takes particular care of scheduling tasks efficiently, using
+well-known algorithms from the literature (@pxref{Task scheduling
+policy}).  In addition, it allows scheduling experts, such as compiler
+or computational library developers, to implement custom scheduling
+policies in a portable fashion (@pxref{Scheduling Policy API}).
+
+The remainder of this section describes the main concepts used in StarPU.
+
+@menu
+* Codelet and Tasks::           
+* StarPU Data Management Library::  
+* Glossary::
+* Research Papers::
+@end menu
+
+@c explain the notion of codelet and task (i.e. g(A, B)
+@node Codelet and Tasks
+@subsection Codelet and Tasks
+
+One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
+computational kernel that can possibly be implemented on multiple architectures
+such as a CPU, a CUDA device or a Cell's SPU.
+
+@c TODO insert illustration f: f_spu, f_cpu, ...
+
+Another important data structure is the @b{task}. Executing a StarPU task
+consists in applying a codelet on a data set, on one of the architectures on
+which the codelet is implemented. A task thus describes the codelet that it
+uses, but also which data are accessed, and how they are
+accessed during the computation (read and/or write).
+StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
+operation. The task structure can also specify a @b{callback} function that is
+called once StarPU has properly executed the task. It also contains optional
+fields that the application may use to give hints to the scheduler (such as
+priority levels).
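+
+As a minimal sketch (in the spirit of the basic examples of this manual), a
+codelet with a single CPU implementation and a task using it could look as
+follows; the function names are purely illustrative:
+
+@cartouche
+@smallexample
+void hello_cpu(void *buffers[], void *cl_arg)
+@{
+    printf("Hello from a StarPU worker\n");
+@}
+
+struct starpu_codelet cl =
+@{
+    .where = STARPU_CPU,
+    .cpu_funcs = @{ hello_cpu, NULL @},
+    .nbuffers = 0
+@};
+
+void submit_hello(void)
+@{
+    struct starpu_task *task = starpu_task_create();
+    task->cl = &cl;              /* codelet to apply */
+    starpu_task_submit(task);    /* non-blocking submission */
+    starpu_task_wait_for_all();  /* wait for completion */
+@}
+@end smallexample
+@end cartouche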
+
+By default, task dependencies are inferred from data dependencies (sequential
+coherency) by StarPU. The application can however disable sequential coherency
+for some data, and express dependencies by hand.
+A task may be identified by a unique 64-bit number chosen by the application,
+which we refer to as a @b{tag}.
+Task dependencies can be enforced by hand either by means of callback functions, by
+submitting other tasks, or by expressing dependencies
+between tags (which can thus correspond to tasks that have not been submitted
+yet).
+
+@c TODO insert illustration f(Ar, Brw, Cr) + ..
+
+@c DSM
+@node StarPU Data Management Library
+@subsection StarPU Data Management Library
+
+Because StarPU schedules tasks at runtime, data transfers have to be
+done automatically and ``just-in-time'' between processing units,
+relieving the application programmer from having to perform them explicitly.
+Moreover, to avoid unnecessary transfers, StarPU keeps data
+where it was last needed, even if it was modified there, and it
+allows multiple copies of the same data to reside at the same time on
+several processing units as long as it is not modified.
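+
+For instance, a vector in main memory is put under the control of this data
+management library by @b{registering} it, which yields a handle that tasks can
+then access; a minimal sketch (assuming @code{NX} is defined elsewhere):
+
+@cartouche
+@smallexample
+float vector[NX];
+starpu_data_handle_t handle;
+
+/* Hand the vector over to StarPU; node 0 is the main memory node */
+starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+
+/* ... submit tasks working on 'handle' ... */
+
+/* Get the vector back into main memory */
+starpu_data_unregister(handle);
+@end smallexample
+@end cartouche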
+
+@node Glossary
+@subsection Glossary
+
+A @b{codelet} records pointers to various implementations of the same
+theoretical function.
+
+A @b{memory node} can be either the main RAM or GPU-embedded memory.
+
+A @b{bus} is a link between memory nodes.
+
+A @b{data handle} keeps track of replicates of the same data (@b{registered} by the
+application) over various memory nodes. The data management library keeps
+them coherent.
+
+The @b{home} memory node of a data handle is the memory node from which the data
+was registered (usually the main memory node).
+
+A @b{task} represents a scheduled execution of a codelet on some data handles.
+
+A @b{tag} is a rendez-vous point. Tasks typically have their own tag, and can
+depend on other tags. The value is chosen by the application.
+
+A @b{worker} executes tasks. There is typically one per CPU computation core and
+one per accelerator (for which a whole CPU core is dedicated).
+
+A @b{driver} drives a given kind of worker. There are currently CPU, CUDA,
+OpenCL and Gordon drivers. They usually start several workers to actually drive
+them.
+
+A @b{performance model} is a (dynamic or static) model of the performance of a
+given codelet. Codelets can have an execution time performance model as well
+as a power consumption performance model.
+
+A data @b{interface} describes the layout of the data: for a vector, a pointer
+to the start, the number of elements and the size of each element; for a matrix, a
+pointer to the start, the number of elements per row, the offset between rows,
+and the size of each element; etc. To access their data, codelet functions are
+given interfaces for the local memory node replicates of the data handles of the
+scheduled task.
+
+@b{Partitioning} data means dividing the data of a given data handle (called
+@b{father}) into a series of @b{children} data handles which designate various
+portions of the former.
+
+A @b{filter} is the function which computes children data handles from a father
+data handle, and thus describes how the partitioning should be done (horizontal,
+vertical, etc.).
+
+@b{Acquiring} a data handle can be done from the main application, to safely
+access the data of a data handle from its home node, without having to
+unregister it.
+
+
+@node Research Papers
+@subsection Research Papers
+
+Research papers about StarPU can be found at
+
+@indicateurl{http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html}
+
+Notably, a good overview is given in the research report
+
+@indicateurl{http://hal.archives-ouvertes.fr/inria-00467677}

+ 432 - 0
doc/chapters/mpi-support.texi

@@ -0,0 +1,432 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+The integration of MPI transfers within task parallelism is done in a
+very natural way by means of asynchronous interactions between the
+application and StarPU.  This is implemented in a separate libstarpumpi library
+which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
+@code{void *} buffers are replaced with @code{starpu_data_handle_t}s, and all
+GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.  The user has to
+use the usual @code{mpirun} command of the MPI implementation to start StarPU on
+the different MPI nodes.
+
+An MPI Insert Task function provides an even more seamless transition to a
+distributed application, by automatically issuing all required data transfers
+according to the task graph and an application-provided distribution.
+
+@menu
+* The API::                     
+* Simple Example::              
+* MPI Insert Task Utility::         
+* MPI Collective Operations::         
+@end menu
+
+@node The API
+@section The API
+
+@subsection Compilation
+
+The flags required to compile or link against the MPI layer are then
+accessible with the following commands:
+
+@example
+% pkg-config --cflags starpumpi-1.0  # options for the compiler
+% pkg-config --libs starpumpi-1.0    # options for the linker
+@end example
+
+Also pass the @code{--static} option if the application is to be linked statically.
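+
+For instance, an application contained in @code{ring.c} (a hypothetical name)
+can be built with the MPI compiler wrapper and launched on four nodes with
+something like:
+
+@example
+% mpicc ring.c -o ring $(pkg-config --cflags starpumpi-1.0) \
+               $(pkg-config --libs starpumpi-1.0)
+% mpirun -np 4 ./ring
+@end example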
+
+@subsection Initialisation
+
+@deftypefun int starpu_mpi_initialize (void)
+Initializes the starpumpi library. This must be called after calling
+@code{starpu_init} and before any other @code{starpu_mpi} function. This
+function does not call @code{MPI_Init}; the application must call it beforehand.
+@end deftypefun
+
+@deftypefun int starpu_mpi_initialize_extended (int *@var{rank}, int *@var{world_size})
+Initializes the starpumpi library. This must be called after calling
+@code{starpu_init} and before any other @code{starpu_mpi} function.
+This function calls @code{MPI_Init}, and should therefore be preferred
+over the previous one for MPI implementations which are not thread-safe.
+The current MPI node rank and world size are returned in @var{rank} and @var{world_size}.
+@end deftypefun
+
+@deftypefun int starpu_mpi_shutdown (void)
+Cleans up the starpumpi library. This must be called after all
+@code{starpu_mpi} functions and before @code{starpu_shutdown}.
+@code{MPI_Finalize} is called if StarPU-MPI was initialized
+by calling @code{starpu_mpi_initialize_extended}.
+@end deftypefun
+
+@subsection Communication
+
+@deftypefun int starpu_mpi_send (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
+@end deftypefun
+
+@deftypefun int starpu_mpi_recv (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, MPI_Status *@var{status})
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend (starpu_data_handle_t @var{data_handle}, starpu_mpi_req *@var{req}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm})
+
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv (starpu_data_handle_t @var{data_handle}, starpu_mpi_req *@var{req}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm})
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_detached (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_detached (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, void (*@var{callback})(void *), void *@var{arg})
+@end deftypefun
+
+@deftypefun int starpu_mpi_wait (starpu_mpi_req *@var{req}, MPI_Status *@var{status})
+@end deftypefun
+
+@deftypefun int starpu_mpi_test (starpu_mpi_req *@var{req}, int *@var{flag}, MPI_Status *@var{status})
+@end deftypefun
+
+@deftypefun int starpu_mpi_barrier (MPI_Comm @var{comm})
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_detached_unlock_tag (starpu_data_handle_t @var{data_handle}, int @var{dest}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag_t @var{tag})
+When the transfer is completed, the tag is unlocked.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_detached_unlock_tag (starpu_data_handle_t @var{data_handle}, int @var{source}, int @var{mpi_tag}, MPI_Comm @var{comm}, starpu_tag_t @var{tag})
+@end deftypefun
+
+@deftypefun int starpu_mpi_isend_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{dest}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
+Asynchronously sends an array of buffers, and unlocks the tag once all
+of them have been transmitted.
+@end deftypefun
+
+@deftypefun int starpu_mpi_irecv_array_detached_unlock_tag (unsigned @var{array_size}, starpu_data_handle_t *@var{data_handle}, int *@var{source}, int *@var{mpi_tag}, MPI_Comm *@var{comm}, starpu_tag_t @var{tag})
+@end deftypefun
+
+@page
+@node Simple Example
+@section Simple Example
+
+@cartouche
+@smallexample
+void increment_token(void)
+@{
+    struct starpu_task *task = starpu_task_create();
+
+    task->cl = &increment_cl;
+    task->handles[0] = token_handle;
+
+    starpu_task_submit(task);
+@}
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+int main(int argc, char **argv)
+@{
+    int rank, size;
+
+    starpu_init(NULL);
+    starpu_mpi_initialize_extended(&rank, &size);
+
+    starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
+
+    unsigned nloops = NITER;
+    unsigned loop;
+
+    unsigned last_loop = nloops - 1;
+    unsigned last_rank = size - 1;
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    for (loop = 0; loop < nloops; loop++) @{
+        int tag = loop*size + rank;
+
+        if (loop == 0 && rank == 0)
+        @{
+            token = 0;
+            fprintf(stdout, "Start with token value %d\n", token);
+        @}
+        else
+        @{
+            starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag,
+                    MPI_COMM_WORLD, NULL, NULL);
+        @}
+
+        increment_token();
+
+        if (loop == last_loop && rank == last_rank)
+        @{
+            starpu_data_acquire(token_handle, STARPU_R);
+            fprintf(stdout, "Finished: token value %d\n", token);
+            starpu_data_release(token_handle);
+        @}
+        else
+        @{
+            starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1,
+                    MPI_COMM_WORLD, NULL, NULL);
+        @}
+    @}
+
+    starpu_task_wait_for_all();
+@end smallexample
+@end cartouche
+
+@cartouche
+@smallexample
+    starpu_mpi_shutdown();
+    starpu_shutdown();
+
+    if (rank == last_rank)
+    @{
+        fprintf(stderr, "[%d] token = %d == %d * %d ?\n", rank, token, nloops, size);
+        STARPU_ASSERT(token == nloops*size);
+    @}
+@end smallexample
+@end cartouche
+
+@page
+@node MPI Insert Task Utility
+@section MPI Insert Task Utility
+
+To save the programmer from having to specify all communications explicitly,
+StarPU provides an "MPI Insert Task Utility". The principle is that the application
+decides a distribution of the data over the MPI nodes by allocating it and
+notifying StarPU of that decision, i.e. telling StarPU which MPI node "owns" which
+data. All MPI nodes then process the whole task graph, and StarPU automatically
+determines which node actually executes which task, as well as the required MPI
+transfers.
+
+@deftypefun int starpu_data_set_tag (starpu_data_handle_t @var{handle}, int @var{tag})
+Tells StarPU-MPI which MPI tag to use when exchanging the data.
+@end deftypefun
+
+@deftypefun int starpu_data_get_tag (starpu_data_handle_t @var{handle})
+Returns the MPI tag to be used when exchanging the data.
+@end deftypefun
+
+@deftypefun int starpu_data_set_rank (starpu_data_handle_t @var{handle}, int @var{rank})
+Tells StarPU-MPI which MPI node "owns" a given piece of data, that is, the node which will
+always keep an up-to-date value, and will by default execute tasks which write
+to it.
+@end deftypefun
+
+@deftypefun int starpu_data_get_rank (starpu_data_handle_t @var{handle})
+Returns the last value set by @code{starpu_data_set_rank}.
+@end deftypefun
+
+@defmac STARPU_EXECUTE_ON_NODE
+This macro is used when calling @code{starpu_mpi_insert_task}, and
+must be followed by an integer value which specifies the node on which
+to execute the codelet.
+@end defmac
+
+@defmac STARPU_EXECUTE_ON_DATA
+This macro is used when calling @code{starpu_mpi_insert_task}, and
+must be followed by a data handle to specify that the node owning the
+given data will execute the codelet.
+@end defmac
+
+@deftypefun int starpu_mpi_insert_task (MPI_Comm @var{comm}, struct starpu_codelet *@var{codelet}, ...)
+Create and submit a task corresponding to @var{codelet} with the following
+arguments.  The argument list must be zero-terminated.
+
+The arguments following the codelets are the same types as for the
+function @code{starpu_insert_task} defined in @ref{Insert Task
+Utility}. The extra argument @code{STARPU_EXECUTE_ON_NODE} followed by an
+integer allows one to specify the MPI node to execute the codelet. It is also
+possible to specify that the node owning a specific data will execute
+the codelet, by using @code{STARPU_EXECUTE_ON_DATA} followed by a data
+handle.
+
+The internal algorithm is as follows:
+@enumerate
+@item Find out whether we (as an MPI node) are to execute the codelet
+because we own the data to be written to. If different nodes own data
+to be written to, the argument @code{STARPU_EXECUTE_ON_NODE} or
+@code{STARPU_EXECUTE_ON_DATA} has to be used to specify which MPI node will
+execute the task.
+@item Send and receive data as requested. Nodes owning data which need to be
+read by the task are sending them to the MPI node which will execute it. The
+latter receives them.
+@item Execute the codelet. This is done by the MPI node selected in the
+1st step of the algorithm.
+@item In the case when different MPI nodes own data to be written to, send
+written data back to their owners.
+@end enumerate
+
+The algorithm also includes a cache mechanism that avoids sending
+data twice to the same MPI node, unless the data has been modified.
+
+@end deftypefun
+
+@deftypefun void starpu_mpi_get_data_on_node (MPI_Comm @var{comm}, starpu_data_handle_t @var{data_handle}, int @var{node})
+Transfers data @var{data_handle} to MPI node @var{node}, sending it from its
+owner if needed. At least the target node and the owner have to call the
+function.
+@end deftypefun
+
+Here is a stencil example showing how to use @code{starpu_mpi_insert_task}. One
+first needs to define a distribution function which specifies the
+locality of the data. Note that the distribution information needs to
+be given to StarPU by calling @code{starpu_data_set_rank}.
+
+@cartouche
+@smallexample
+/* Returns the MPI node number where data is */
+int my_distrib(int x, int y, int nb_nodes) @{
+  /* Block distrib */
+  return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
+
+  // /* Other examples useful for other kinds of computations */
+  // /* / distrib */
+  // return (x+y) % nb_nodes;
+
+  // /* Block cyclic distrib */
+  // unsigned side = sqrt(nb_nodes);
+  // return x % side + (y % side) * side;
+@}
+@end smallexample
+@end cartouche
+
+Now the data can be registered within StarPU. Data which are not
+owned but will be needed for computations can be registered through
+the lazy allocation mechanism, i.e. with a @code{home_node} set to -1.
+StarPU will automatically allocate the memory when it is used for the
+first time.
+
+One can note an optimization here (the @code{else if} test): we only register
+data which will be needed by the tasks that we will execute.
+
+@cartouche
+@smallexample
+    unsigned matrix[X][Y];
+    starpu_data_handle_t data_handles[X][Y];
+
+    for(x = 0; x < X; x++) @{
+        for (y = 0; y < Y; y++) @{
+            int mpi_rank = my_distrib(x, y, size);
+             if (mpi_rank == my_rank)
+                /* Owning data */
+                starpu_variable_data_register(&data_handles[x][y], 0,
+                                              (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
+            else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
+                  || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+                /* I don't own that index, but will need it for my computations */
+                starpu_variable_data_register(&data_handles[x][y], -1,
+                                              (uintptr_t)NULL, sizeof(unsigned));
+            else
+                /* I know it's useless to allocate anything for this */
+                data_handles[x][y] = NULL;
+            if (data_handles[x][y])
+                starpu_data_set_rank(data_handles[x][y], mpi_rank);
+        @}
+    @}
+@end smallexample
+@end cartouche
+
+Now @code{starpu_mpi_insert_task()} can be called for the different
+steps of the application.
+
+@cartouche
+@smallexample
+    for(loop=0 ; loop<niter; loop++)
+        for (x = 1; x < X-1; x++)
+            for (y = 1; y < Y-1; y++)
+                starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl,
+                                       STARPU_RW, data_handles[x][y],
+                                       STARPU_R, data_handles[x-1][y],
+                                       STARPU_R, data_handles[x+1][y],
+                                       STARPU_R, data_handles[x][y-1],
+                                       STARPU_R, data_handles[x][y+1],
+                                       0);
+    starpu_task_wait_for_all();
+@end smallexample
+@end cartouche
+
+In other words, all MPI nodes process the whole task graph, but as mentioned above, for
+each task, only the MPI node which owns the data being written to (here,
+@code{data_handles[x][y]}) will actually run the task. The other MPI nodes will
+automatically send the required data.
+
+@node MPI Collective Operations
+@section MPI Collective Operations
+
+@deftypefun int starpu_mpi_scatter_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+Scatters data among the processes of the communicator based on the ownership of
+the data. For each data item of the array @var{data_handles}, the
+process @var{root} sends it to the process owning this data.
+Processes receiving data must have valid data handles to receive it.
+@end deftypefun
+
+@deftypefun int starpu_mpi_gather_detached (starpu_data_handle_t *@var{data_handles}, int @var{count}, int @var{root}, MPI_Comm @var{comm})
+Gathers data from the different processes of the communicator onto the
+process @var{root}. Each process owning a data handle in the array
+@var{data_handles} will send it to the process @var{root}. The
+process @var{root} must have valid data handles to receive the data.
+@end deftypefun
+
+@page
+@cartouche
+@smallexample
+if (rank == root)
+@{
+    /* Allocate the vector */
+    vector = malloc(nblocks * sizeof(float *));
+    for(x=0 ; x<nblocks ; x++)
+    @{
+        starpu_malloc((void **)&vector[x], block_size*sizeof(float));
+    @}
+@}
+
+/* Allocate data handles and register data to StarPU */
+data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+for(x = 0; x < nblocks ;  x++)
+@{
+    int mpi_rank = my_distrib(x, nodes);
+    if (rank == root) @{
+        starpu_vector_data_register(&data_handles[x], 0, (uintptr_t)vector[x],
+                                    block_size, sizeof(float));
+    @}
+    else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1))) @{
+        /* I own that index, or I will need it for my computations */
+        starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL,
+                                   block_size, sizeof(float));
+    @}
+    else @{
+        /* I know it's useless to allocate anything for this */
+        data_handles[x] = NULL;
+    @}
+    if (data_handles[x]) @{
+        starpu_data_set_rank(data_handles[x], mpi_rank);
+    @}
+@}
+
+/* Scatter the matrix among the nodes */
+starpu_mpi_scatter_detached(data_handles, nblocks, root, MPI_COMM_WORLD);
+
+/* Calculation */
+for(x = 0; x < nblocks ;  x++) @{
+    if (data_handles[x]) @{
+        int owner = starpu_data_get_rank(data_handles[x]);
+        if (owner == rank) @{
+            starpu_insert_task(&cl, STARPU_RW, data_handles[x], 0);
+        @}
+    @}
+@}
+
+/* Gather the matrix on main node */
+starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
+@end smallexample
+@end cartouche
+
+

+ 429 - 0
doc/chapters/perf-feedback.texi

@@ -0,0 +1,429 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* On-line::                     On-line performance feedback
+* Off-line::                    Off-line performance feedback
+* Codelet performance::         Performance of codelets
+* Theoretical lower bound on execution time API::  
+@end menu
+
+@node On-line
+@section On-line performance feedback
+
+@menu
+* Enabling monitoring::         Enabling on-line performance monitoring
+* Task feedback::               Per-task feedback
+* Codelet feedback::            Per-codelet feedback
+* Worker feedback::             Per-worker feedback
+* Bus feedback::                Bus-related feedback
+* StarPU-Top::                  StarPU-Top interface
+@end menu
+
+@node Enabling monitoring
+@subsection Enabling on-line performance monitoring
+
+In order to enable online performance monitoring, the application can call
+@code{starpu_profiling_status_set(STARPU_PROFILING_ENABLE)}. It is possible to
+detect whether monitoring is already enabled or not by calling
+@code{starpu_profiling_status_get()}. Enabling monitoring also reinitializes all
+previously collected feedback. The @code{STARPU_PROFILING} environment variable
+can also be set to 1 to achieve the same effect.
+
+Likewise, performance monitoring is stopped by calling
+@code{starpu_profiling_status_set(STARPU_PROFILING_DISABLE)}. Note that this
+does not reset the performance counters so that the application may consult
+them later on.
+
+More details about the performance monitoring API are available in section
+@ref{Profiling API}.
+
+@node Task feedback
+@subsection Per-task feedback
+
+If profiling is enabled, a pointer to a @code{starpu_task_profiling_info}
+structure is put in the @code{.profiling_info} field of the @code{starpu_task}
+structure when a task terminates.
+This structure is automatically destroyed when the task structure is destroyed,
+either automatically or by calling @code{starpu_task_destroy}.
+
+The @code{starpu_task_profiling_info} structure indicates the date when the
+task was submitted (@code{submit_time}), started (@code{start_time}), and
+terminated (@code{end_time}), relative to the initialization of
+StarPU with @code{starpu_init}. It also specifies the identifier of the worker
+that has executed the task (@code{workerid}).
+These dates are stored as @code{timespec} structures, which the user may convert
+into microseconds using the @code{starpu_timing_timespec_to_us} helper
+function.
+
+It is worth noting that the application may directly access this structure from
+the callback executed at the end of the task. The @code{starpu_task} structure
+associated with the callback currently being executed is indeed accessible with
+the @code{starpu_get_current_task()} function.
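+
+For instance, a callback along the following lines (a mere sketch, without
+error handling) could report the execution time of each task:
+
+@cartouche
+@smallexample
+void profiling_callback(void *arg)
+@{
+    struct starpu_task *task = starpu_get_current_task();
+    struct starpu_task_profiling_info *info = task->profiling_info;
+
+    if (info) /* non-NULL only when profiling is enabled */
+    @{
+        double start = starpu_timing_timespec_to_us(&info->start_time);
+        double end   = starpu_timing_timespec_to_us(&info->end_time);
+        fprintf(stderr, "task ran for %.2f us on worker %d\n",
+                end - start, info->workerid);
+    @}
+@}
+@end smallexample
+@end cartouche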
+
+@node Codelet feedback
+@subsection Per-codelet feedback
+
+The @code{per_worker_stats} field of the @code{struct starpu_codelet} structure is
+an array of counters. The i-th entry of the array is incremented every time a
+task implementing the codelet is executed on the i-th worker.
+This array is not reinitialized when profiling is enabled or disabled.
+
+@node Worker feedback
+@subsection Per-worker feedback
+
+The second argument returned by the @code{starpu_worker_get_profiling_info}
+function is a @code{starpu_worker_profiling_info} structure that gives
+statistics about the specified worker. This structure specifies when StarPU
+started collecting profiling information for that worker (@code{start_time}),
+the duration of the profiling measurement interval (@code{total_time}), the
+time spent executing kernels (@code{executing_time}), the time spent sleeping
+because there is no task to execute at all (@code{sleeping_time}), and the
+number of tasks that were executed while profiling was enabled.
+These values give an estimation of the proportion of time spent doing real work,
+and of the time either spent sleeping because there are not enough executable
+tasks or simply wasted in pure StarPU overhead.
+
+Calling @code{starpu_worker_get_profiling_info} resets the profiling
+information associated with a worker.
+
+When an FxT trace is generated (see @ref{Generating traces}), it is also
+possible to use the @code{starpu_top} script (described in @ref{starpu-top}) to
+generate a graphic showing the evolution of these values over time, for
+the different workers.
+
+@node Bus feedback
+@subsection Bus-related feedback 
+
+TODO: add STARPU_BUS_STATS
+
+@c how to enable/disable performance monitoring
+
+@c what kind of information do we get ?
+
+The bus speed measured by StarPU can be displayed by using the
+@code{starpu_machine_display} tool, for instance:
+
+@example
+StarPU has found:
+        3 CUDA devices
+                CUDA 0 (Tesla C2050 02:00.0)
+                CUDA 1 (Tesla C2050 03:00.0)
+                CUDA 2 (Tesla C2050 84:00.0)
+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
+RAM     0.000000        5176.530428     5176.492994     5191.710722
+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
+@end example
+
+@node StarPU-Top
+@subsection StarPU-Top interface
+
+StarPU-Top is an interface which remotely displays the on-line state of a StarPU
+application and permits the user to change parameters on the fly.
+
+Variables to be monitored can be registered by calling the
+@code{starpu_top_add_data_boolean}, @code{starpu_top_add_data_integer},
+@code{starpu_top_add_data_float} functions, e.g.:
+
+@cartouche
+@smallexample
+starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
+@end smallexample
+@end cartouche
+
+The application should then call @code{starpu_top_init_and_wait} to give its name
+and wait for StarPU-Top to get a start request from the user. The name is used
+by StarPU-Top to quickly reload a previously-saved layout of parameter display.
+
+@cartouche
+@smallexample
+starpu_top_init_and_wait("the application");
+@end smallexample
+@end cartouche
+
+The new values can then be provided thanks to
+@code{starpu_top_update_data_boolean}, @code{starpu_top_update_data_integer},
+@code{starpu_top_update_data_float}, e.g.:
+
+@cartouche
+@smallexample
+starpu_top_update_data_integer(data, mynum);
+@end smallexample
+@end cartouche
+
+Updateable parameters can be registered thanks to @code{starpu_top_register_parameter_boolean}, @code{starpu_top_register_parameter_integer}, @code{starpu_top_register_parameter_float}, e.g.:
+
+@cartouche
+@smallexample
+float alpha;
+starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
+@end smallexample
+@end cartouche
+
+@code{modif_hook} is a function which will be called when the parameter is modified; it can, for instance, print the new value:
+
+@cartouche
+@smallexample
+void modif_hook(struct starpu_top_param *d) @{
+    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
+@}
+@end smallexample
+@end cartouche
+
+Task schedulers should notify StarPU-Top when they have decided when a task
+will be scheduled, so that StarPU-Top can show it in its Gantt chart, for instance:
+
+@cartouche
+@smallexample
+starpu_top_task_prevision(task, workerid, begin, end);
+@end smallexample
+@end cartouche
+
+Starting StarPU-Top and the application can be done in two ways:
+
+@itemize
+@item The application is started by hand on some machine (and thus already
+waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
+checkbox should be unchecked, and the hostname and port (default is 2011) on
+which the application is already running should be specified. Clicking on the
+connection button will thus connect to the already-running application.
+@item StarPU-Top is started first, and clicking on the connection button will
+start the application itself (possibly on a remote machine). The SSH checkbox
+should be checked, and a command line provided, e.g.:
+
+@example
+ssh myserver STARPU_SCHED=heft ./application
+@end example
+
+If port 2011 of the remote machine cannot be accessed directly, SSH port forwarding should be added:
+
+@example
+ssh -L 2011:localhost:2011 myserver STARPU_SCHED=heft ./application
+@end example
+
+and "localhost" should be used as IP Address to connect to.
+@end itemize
+
+@node Off-line
+@section Off-line performance feedback
+
+@menu
+* Generating traces::           Generating traces with FxT
+* Gantt diagram::               Creating a Gantt Diagram
+* DAG::                         Creating a DAG with graphviz
+* starpu-top::                  Monitoring activity
+@end menu
+
+@node Generating traces
+@subsection Generating traces with FxT
+
+StarPU can use the FxT library (see
+@indicateurl{https://savannah.nongnu.org/projects/fkt/}) to generate traces
+with a limited runtime overhead.
+
+You can either get a tarball:
+@example
+% wget http://download.savannah.gnu.org/releases/fkt/fxt-0.2.2.tar.gz
+@end example
+
+or use the FxT library from CVS (autotools are required):
+@example
+% cvs -d :pserver:anonymous@@cvs.sv.gnu.org:/sources/fkt co FxT
+% ./bootstrap
+@end example
+
+Compiling and installing the FxT library in the @code{$FXTDIR} path is
+done following the standard procedure:
+@example
+% ./configure --prefix=$FXTDIR
+% make
+% make install
+@end example
+
+In order to have StarPU generate traces, StarPU should be configured with
+the @code{--with-fxt} option:
+@example
+$ ./configure --with-fxt=$FXTDIR
+@end example
+
+Or you can simply point @code{PKG_CONFIG_PATH} to
+@code{$FXTDIR/lib/pkgconfig} and pass @code{--with-fxt} to @code{./configure}.
+
+When FxT is enabled, a trace is generated when StarPU is terminated by calling
+@code{starpu_shutdown()}. The trace is a binary file whose name has the form
+@code{prof_file_XXX_YYY} where @code{XXX} is the user name, and
+@code{YYY} is the pid of the process that used StarPU. This file is saved in the
+@code{/tmp/} directory by default, or in the directory specified by
+the @code{STARPU_FXT_PREFIX} environment variable.
+
+@node Gantt diagram
+@subsection Creating a Gantt Diagram
+
+When the FxT trace file @code{filename} has been generated, it is possible to
+generate a trace in the Paje format by calling:
+@example
+% starpu_fxt_tool -i filename
+@end example
+
+Alternatively, setting the @code{STARPU_GENERATE_TRACE} environment variable
+to 1 before application execution will make StarPU do it automatically at
+application shutdown.
+
+This will create a @code{paje.trace} file in the current directory that can be
+inspected with ViTE, an open-source trace visualization tool. More information
+about ViTE is available at @indicateurl{http://vite.gforge.inria.fr/}. It is
+possible to open the @code{paje.trace} file with ViTE by using the following
+command:
+@example
+% vite paje.trace
+@end example
+
+@node DAG
+@subsection Creating a DAG with graphviz
+
+When the FxT trace file @code{filename} has been generated, it is possible to
+generate a task graph in the DOT format by calling:
+@example
+$ starpu_fxt_tool -i filename
+@end example
+
+This will create a @code{dag.dot} file in the current directory. This file is a
+task graph described using the DOT language. It is possible to get a
+graphical output of the graph by using the graphviz library:
+@example
+$ dot -Tpdf dag.dot -o output.pdf
+@end example
+
+@node starpu-top
+@subsection Monitoring activity
+
+When the FxT trace file @code{filename} has been generated, it is possible to
+generate an activity trace by calling:
+@example
+$ starpu_fxt_tool -i filename
+@end example
+
+This will create an @code{activity.data} file in the current
+directory. A profile of the application showing the activity of StarPU
+during the execution of the program can be generated:
+@example
+$ starpu_top activity.data
+@end example
+
+This will create a file named @code{activity.eps} in the current directory.
+This picture is composed of two parts.
+The first part shows the activity of the different workers. The green sections
+indicate which proportion of the time was spent executing kernels on the
+processing unit. The red sections indicate the proportion of time spent in
+StarPU: a large overhead may indicate that the granularity is too
+low, and that bigger tasks may be needed to use the processing unit more
+efficiently. The black sections indicate that the processing unit was blocked
+because there was no task to process: this may indicate a lack of parallelism,
+which may be alleviated by creating more tasks when possible.
+
+The second part of the @code{activity.eps} picture is a graph showing the
+evolution of the number of tasks available in the system during the execution.
+Ready tasks are shown in black, and tasks that are submitted but not
+schedulable yet are shown in grey.
+
+@node Codelet performance
+@section Performance of codelets
+
+The performance model of codelets (described in @ref{Performance model example}) can be examined by using the
+@code{starpu_perfmodel_display} tool:
+
+@example
+$ starpu_perfmodel_display -l
+file: <malloc_pinned.hannibal>
+file: <starpu_slu_lu_model_21.hannibal>
+file: <starpu_slu_lu_model_11.hannibal>
+file: <starpu_slu_lu_model_22.hannibal>
+file: <starpu_slu_lu_model_12.hannibal>
+@end example
+
+Here, the codelets of the lu example are available. We can examine the
+performance of the 22 kernel (in micro-seconds):
+
+@example
+$ starpu_perfmodel_display -s starpu_slu_lu_model_22
+performance model for cpu
+# hash      size       mean          dev           n
+57618ab0    19660800   2.851069e+05  1.829369e+04  109
+performance model for cuda_0
+# hash      size       mean          dev           n
+57618ab0    19660800   1.164144e+04  1.556094e+01  315
+performance model for cuda_1
+# hash      size       mean          dev           n
+57618ab0    19660800   1.164271e+04  1.330628e+01  360
+performance model for cuda_2
+# hash      size       mean          dev           n
+57618ab0    19660800   1.166730e+04  3.390395e+02  456
+@end example
+
+We can see that for the given size, over a sample of a few hundred
+executions, the GPUs are about 20 times faster than the CPUs (numbers are in
+micro-seconds). The standard deviation is extremely low for the GPUs, and less
+than 10% for the CPUs.
+
+The @code{starpu_regression_display} tool does the same for regression-based
+performance models. It also writes a @code{.gp} file in the current directory,
+to be run in the @code{gnuplot} tool, which shows the corresponding curve.
+
+The same can also be achieved by using StarPU's library API, see
+@ref{Performance Model API} and notably the @code{starpu_load_history_debug}
+function. The source code of the @code{starpu_perfmodel_display} tool can be a
+useful example.
+
+@node Theoretical lower bound on execution time API
+@section Theoretical lower bound on execution time
+
+See @ref{Theoretical lower bound on execution time} for an example of how to use
+this API. It permits recording a trace of which tasks are needed to complete the
+application, and then, by using a linear system, provides a theoretical lower
+bound on the execution time (i.e. with an ideal scheduling).
+
+The computed bound is not really correct when dependencies are not taken into
+account, but for an application which has enough parallelism, it is very
+close to the bound computed with dependencies enabled (which takes much
+more time to compute), and thus provides a good-enough estimation of the ideal
+execution time.
+
+@deftypefun void starpu_bound_start (int @var{deps}, int @var{prio})
+Start recording tasks (resets stats).  @var{deps} tells whether
+dependencies should be recorded too (this is quite expensive)
+@end deftypefun
+
+@deftypefun void starpu_bound_stop (void)
+Stop recording tasks
+@end deftypefun
+
+@deftypefun void starpu_bound_print_dot ({FILE *}@var{output})
+Print the DAG that was recorded
+@end deftypefun
+
+@deftypefun void starpu_bound_compute ({double *}@var{res}, {double *}@var{integer_res}, int @var{integer})
+Get the theoretical lower bound (in ms) (needs glpk support detected by the @code{configure} script)
+@end deftypefun
+
+@deftypefun void starpu_bound_print_lp ({FILE *}@var{output})
+Emit the Linear Programming system on @var{output} for the recorded tasks, in
+the lp format
+@end deftypefun
+
+@deftypefun void starpu_bound_print_mps ({FILE *}@var{output})
+Emit the Linear Programming system on @var{output} for the recorded tasks, in
+the mps format
+@end deftypefun
+
+@deftypefun void starpu_bound_print ({FILE *}@var{output}, int @var{integer})
+Emit statistics of actual execution vs the theoretical lower bound. @var{integer}
+permits choosing between integer solving (which takes a long time but is
+correct) and relaxed solving (which provides an approximate solution).
+@end deftypefun
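+
+A minimal usage sketch, assuming all the tasks of interest are submitted
+between the two recording calls, could look like:
+
+@cartouche
+@smallexample
+starpu_bound_start(0, 0);  /* record neither dependencies nor priorities */
+/* ... submit the tasks of the application ... */
+starpu_task_wait_for_all();
+starpu_bound_stop();
+
+double bound, integer_bound;
+starpu_bound_compute(&bound, &integer_bound, 0);  /* relaxed solving */
+fprintf(stderr, "theoretical lower bound: %f ms\n", bound);
+@end smallexample
+@end cartouche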

+ 331 - 0
doc/chapters/perf-optimization.texi

@@ -0,0 +1,331 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+TODO: improve!
+
+@menu
+* Data management::
+* Task granularity::
+* Task submission::
+* Task priorities::
+* Task scheduling policy::
+* Performance model calibration::
+* Task distribution vs Data transfer::
+* Data prefetch::
+* Power-based scheduling::
+* Profiling::
+* CUDA-specific optimizations::
+* Performance debugging::
+@end menu
+
+Simply encapsulating application kernels into tasks already permits
+seamlessly supporting CPUs and GPUs at the same time. To achieve good
+performance, a few additional changes are needed.
+
+@node Data management
+@section Data management
+
+When the application allocates data, whenever possible it should use the
+@code{starpu_malloc} function, which will ask CUDA or
+OpenCL to make the allocation itself and pin the corresponding allocated
+memory. This is needed to permit asynchronous data transfer, i.e. permit data
+transfer to overlap with computations. Otherwise, the trace will show that the
+@code{DriverCopyAsync} state takes a lot of time; this is because CUDA or OpenCL
+then reverts to synchronous transfers.
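+
+For instance, a pinned buffer of @code{NX} floats can be obtained with the
+following sketch (@code{starpu_free} releases it once it is no longer
+registered):
+
+@cartouche
+@smallexample
+float *vector;
+/* pinned allocation, so that CUDA/OpenCL transfers can be asynchronous */
+starpu_malloc((void **)&vector, NX * sizeof(vector[0]));
+/* ... register the buffer, submit tasks, unregister ... */
+starpu_free(vector);
+@end smallexample
+@end cartouche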
+
+By default, StarPU leaves replicates of data wherever they were used, in case they
+will be re-used by other tasks, thus saving the data transfer time. When some
+task modifies some data, all the other replicates are invalidated, and only the
+processing unit which ran that task will have a valid replicate of the data. If the application knows
+that this data will not be re-used by further tasks, it should advise StarPU to
+immediately replicate it to a desired list of memory nodes (given through a
+bitmask). This can be understood like the write-through mode of CPU caches.
+
+@cartouche
+@smallexample
+starpu_data_set_wt_mask(img_handle, 1<<0);
+@end smallexample
+@end cartouche
+
+will for instance request to always automatically transfer a replicate into the
+main memory (node 0), as bit 0 of the write-through bitmask is set.
+
+@cartouche
+@smallexample
+starpu_data_set_wt_mask(img_handle, ~0U);
+@end smallexample
+@end cartouche
+
+will request to always automatically broadcast the updated data to all memory
+nodes.
+
+Setting the write-through mask to @code{~0U} can also be useful to make sure all
+memory nodes always have a copy of the data, so that it is never evicted when
+memory gets scarce.
+
+Implicit data dependency computation can become expensive if a lot
+of tasks access the same piece of data. If no dependency is required
+on some piece of data (e.g. because it is only accessed in read-only
+mode, or because write accesses are actually commutative), use the
+@code{starpu_data_set_sequential_consistency_flag} function to disable implicit
+dependencies on that data.
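+
+For instance, with @code{table_handle} standing for such a piece of data
+registered earlier by the application:
+
+@cartouche
+@smallexample
+/* the table is only ever read by tasks, no implicit dependencies needed */
+starpu_data_set_sequential_consistency_flag(table_handle, 0);
+@end smallexample
+@end cartouche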
+
+@node Task granularity
+@section Task granularity
+
+Like any other runtime, StarPU has some overhead for managing tasks. Since
+it does smart scheduling and data management, that overhead is not always
+negligible. The order of magnitude of the overhead is typically a couple of
+microseconds. The amount of work that a task does should thus be somewhat
+bigger, to make sure that the overhead becomes negligible. The offline
+performance feedback can provide a measure of task length, which should thus be
+checked if bad performance is observed.
+
+@node Task submission
+@section Task submission
+
+To let StarPU make online optimizations, tasks should be submitted
+asynchronously as much as possible. Ideally, all the tasks should be
+submitted first, and only then should calls to @code{starpu_task_wait_for_all} or
+@code{starpu_data_unregister} be made to wait for
+termination. StarPU will then be able to rework the whole schedule, overlap
+computation with communication, manage accelerator local memory usage, etc.
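+
+A typical submission loop thus looks like the following sketch, where
+@code{create_task} stands for a hypothetical application helper building one
+task:
+
+@cartouche
+@smallexample
+unsigned i;
+for (i = 0; i < ntasks; i++)
+@{
+    struct starpu_task *task = create_task(i); /* application helper */
+    starpu_task_submit(task);                  /* returns immediately */
+@}
+/* only now wait for all the submitted work to complete */
+starpu_task_wait_for_all();
+@end smallexample
+@end cartouche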
+
+@node Task priorities
+@section Task priorities
+
+By default, StarPU will consider the tasks in the order they are submitted by
+the application. If the application programmer knows that some tasks should
+be performed in priority (for instance because their output is needed by many
+other tasks and may thus be a bottleneck if not executed early enough), the
+@code{priority} field of the task structure should be set to transmit the
+priority information to StarPU.
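+
+For instance, a task known to be on the critical path can be submitted with the
+highest priority (a sketch, assuming @code{cl} is the application's codelet and
+that the @code{STARPU_MAX_PRIO} value is available):
+
+@cartouche
+@smallexample
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+/* hint that many other tasks will need the output of this one */
+task->priority = STARPU_MAX_PRIO;
+starpu_task_submit(task);
+@end smallexample
+@end cartouche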
+
+@node Task scheduling policy
+@section Task scheduling policy
+
+By default, StarPU uses the @code{eager} simple greedy scheduler. This is
+because it provides correct load balance even if the application codelets do not
+have performance models. If your application codelets have performance models
+(@pxref{Performance model example} for examples showing how to do it),
+you should change the scheduler thanks to the @code{STARPU_SCHED} environment
+variable. For instance @code{export STARPU_SCHED=dmda}. Setting it to
+@code{help} lists the available schedulers.
+
+The @b{eager} scheduler uses a central task queue, from which workers draw tasks
+to work on. This however does not permit prefetching data, since the scheduling
+decision is taken late. If a task has a non-zero priority, it is put at the front of the queue.
+
+The @b{prio} scheduler also uses a central task queue, but sorts tasks by
+priority (between -5 and 5).
+
+The @b{random} scheduler distributes tasks randomly according to assumed worker
+overall performance.
+
+The @b{ws} (work stealing) scheduler schedules tasks on the local worker by
+default. When a worker becomes idle, it steals a task from the most loaded
+worker.
+
+The @b{dm} (deque model) scheduler takes task execution performance models into account to
+perform a HEFT-like scheduling strategy: it schedules tasks where their
+termination time will be minimal.
+
+The @b{dmda} (deque model data aware) scheduler is similar to dm, it also takes
+into account data transfer time.
+
+The @b{dmdar} (deque model data aware ready) scheduler is similar to dmda,
+it also sorts tasks on per-worker queues by number of already-available data
+buffers.
+
+The @b{dmdas} (deque model data aware sorted) scheduler is similar to dmda, it
+also supports arbitrary priority values.
+
+The @b{heft} (heterogeneous earliest finish time) scheduler is similar to dmda, it also supports task bundles.
+
+The @b{pheft} (parallel HEFT) scheduler is similar to heft, it also supports
+parallel tasks (still experimental).
+
+The @b{pgreedy} (parallel greedy) scheduler is similar to greedy, it also
+supports parallel tasks (still experimental).
+
+@node Performance model calibration
+@section Performance model calibration
+
+Most schedulers are based on an estimation of codelet duration on each kind
+of processing unit. For this to be possible, the application programmer needs
+to configure a performance model for the codelets of the application (see
+@ref{Performance model example} for instance). History-based performance models
+use on-line calibration.  StarPU will automatically calibrate codelets
+which have never been calibrated yet, and save the result in
+@code{~/.starpu/sampling/codelets}.
+The models are indexed by machine name. To share the models between machines (e.g. for a homogeneous cluster), use @code{export STARPU_HOSTNAME=some_global_name}. To force continuing calibration, use
+@code{export STARPU_CALIBRATE=1} . This may be necessary if your application
+has not-so-stable performance. StarPU will force calibration (and thus ignore
+the current result) until 10 (_STARPU_CALIBRATION_MINIMUM) measurements have been
+made on each architecture, to avoid badly scheduling tasks just because the
+first measurements were not so good. Details on the current performance model status
+can be obtained from the @code{starpu_perfmodel_display} command: the @code{-l}
+option lists the available performance models, and the @code{-s} option permits
+choosing the performance model to be displayed. The result looks like:
+
+@example
+$ starpu_perfmodel_display -s starpu_dlu_lu_model_22
+performance model for cpu
+# hash    size     mean          dev           n
+880805ba  98304    2.731309e+02  6.010210e+01  1240
+b50b6605  393216   1.469926e+03  1.088828e+02  1240
+5c6c3401  1572864  1.125983e+04  3.265296e+03  1240
+@end example
+
+This shows that for the LU 22 kernel with a 1.5MiB matrix, the average
+execution time on CPUs was about 11ms, with a 3ms standard deviation, over
+1240 samples. It is a good idea to check this before doing actual performance
+measurements.
+
+A graph can be drawn by using the @code{starpu_perfmodel_plot} tool:
+
+@example
+$ starpu_perfmodel_plot -s starpu_dlu_lu_model_22
+98304 393216 1572864 
+$ gnuplot starpu_starpu_dlu_lu_model_22.gp
+$ gv starpu_starpu_dlu_lu_model_22.eps
+@end example
+
+If a kernel source code was modified (e.g. to improve its performance), the
+calibration information is stale and should be dropped, so that calibration
+starts from scratch. This can be done by using @code{export STARPU_CALIBRATE=2}.
+
+Note: due to CUDA limitations, to be able to measure kernel duration,
+calibration mode needs to disable asynchronous data transfers. Calibration thus
+disables data transfer / computation overlapping, and should thus not be used
+for benchmarking. Note 2: history-based performance models get calibrated
+only if a performance-model-based scheduler is chosen.
+
+@node Task distribution vs Data transfer
+@section Task distribution vs Data transfer
+
+Distributing tasks to balance the load induces a data transfer penalty. StarPU
+thus needs to find a balance between both. The target function that the
+@code{dmda} scheduler of StarPU
+tries to minimize is @code{alpha * T_execution + beta * T_data_transfer}, where
+@code{T_execution} is the estimated execution time of the codelet (usually
+accurate), and @code{T_data_transfer} is the estimated data transfer time. The
+latter is estimated based on bus calibration before execution start,
+i.e. with an idle machine, thus without contention. You can force bus re-calibration by running
+@code{starpu_calibrate_bus}. The beta parameter defaults to 1, but it can be
+worth trying to tweak it by using @code{export STARPU_SCHED_BETA=2} for instance,
+since during real application execution, contention makes transfer times bigger.
+This is of course imprecise, but in practice, a rough estimation already gives
+results as good as those a precise estimation would give.
+
+@node Data prefetch
+@section Data prefetch
+
+The @code{heft}, @code{dmda} and @code{pheft} scheduling policies perform data prefetch (see @ref{STARPU_PREFETCH}):
+as soon as a scheduling decision is taken for a task, requests are issued to
+transfer its required data to the target processing unit, if needed, so that
+when the processing unit actually starts the task, its data will hopefully be
+already available and it will not have to wait for the transfer to finish.
+
+The application may want to perform some manual prefetching, for several reasons
+such as excluding initial data transfers from performance measurements, or
+setting up an initial statically-computed data distribution on the machine
+before submitting tasks, which will thus guide StarPU toward an initial task
+distribution (since StarPU will try to avoid further transfers).
+
+This can be achieved by giving the @code{starpu_data_prefetch_on_node} function
+the handle and the desired target memory node.
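+
+For instance, the following sketch requests an asynchronous prefetch of a
+handle to memory node 1 (the third, asynchronous-flag parameter is an
+assumption about the function's signature):
+
+@cartouche
+@smallexample
+/* handle, target memory node, asynchronous flag */
+starpu_data_prefetch_on_node(vector_handle, 1, 1);
+@end smallexample
+@end cartouche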
+
+@node Power-based scheduling
+@section Power-based scheduling
+
+If the application can provide some power performance model (through
+the @code{power_model} field of the codelet structure), StarPU will
+take it into account when distributing tasks. The target function that
+the @code{dmda} scheduler minimizes becomes @code{alpha * T_execution +
+beta * T_data_transfer + gamma * Consumption}, where @code{Consumption}
+is the estimated task consumption in Joules. To tune this parameter, use
+@code{export STARPU_SCHED_GAMMA=3000} for instance, to express that each Joule
+(i.e. 1 kW during 1000 us) is worth 3000 us of execution time penalty. Setting
+@code{alpha} and @code{beta} to zero permits taking only power consumption into account.
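+
+A sketch of how a power model could be attached to a codelet follows; the
+performance model structure and the field names are assumptions based on the
+description above:
+
+@cartouche
+@smallexample
+static struct starpu_perfmodel power_model = @{
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "my_codelet_power"
+@};
+
+static struct starpu_codelet cl = @{
+    /* ... functions, nbuffers, modes, execution model ... */
+    .power_model = &power_model /* estimated consumption in Joules */
+@};
+@end smallexample
+@end cartouche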
+
+This is however not sufficient to correctly optimize power: the scheduler would
+simply tend to run all computations on the most energy-conservative processing
+unit. To account for the consumption of the whole machine (including idle
+processing units), the idle power of the machine should be given by setting
+@code{export STARPU_IDLE_POWER=200} for 200W, for instance. This value can often
+be obtained from the machine power supplier.
+
+The power actually consumed by the total execution can be displayed by setting
+@code{export STARPU_PROFILING=1 STARPU_WORKER_STATS=1} .
+
+@node Profiling
+@section Profiling
+
+A quick view of how many tasks each worker has executed can be obtained by setting
+@code{export STARPU_WORKER_STATS=1}. This is a convenient way to check that
+execution did happen on accelerators, without penalizing performance with
+the profiling overhead.
+
+A quick view of how many data transfers have been issued can be obtained by setting
+@code{export STARPU_BUS_STATS=1}.
+
+More detailed profiling information can be enabled by using @code{export STARPU_PROFILING=1} or by
+calling @code{starpu_profiling_status_set} from the source code.
+Statistics on the execution can then be obtained by using @code{export
+STARPU_BUS_STATS=1} and @code{export STARPU_WORKER_STATS=1}.
+More details on performance feedback are provided in the next chapter.
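+
+For instance, a sketch enabling profiling programmatically (assuming the usual
+enabling constant) is:
+
+@cartouche
+@smallexample
+/* equivalent to setting STARPU_PROFILING=1 in the environment */
+starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+@end smallexample
+@end cartouche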
+
+@node CUDA-specific optimizations
+@section CUDA-specific optimizations
+
+Due to CUDA limitations, StarPU will have a hard time overlapping its own
+communications and the codelet computations if the application does not use a
+dedicated CUDA stream for its computations. StarPU provides one through
+@code{starpu_cuda_get_local_stream()}, which should be used by all CUDA codelet
+operations. For instance:
+
+@cartouche
+@smallexample
+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaStreamSynchronize(starpu_cuda_get_local_stream());
+@end smallexample
+@end cartouche
+
+StarPU already does appropriate calls for the CUBLAS library.
+
+Unfortunately, some CUDA libraries do not have stream variants of
+kernels. That will lower the potential for overlapping.
+
+@node Performance debugging
+@section Performance debugging
+
+To get an idea of what is happening, a lot of performance feedback is available,
+as detailed in the next chapter. The various pieces of information should be checked:
+
+@itemize
+@item What does the Gantt diagram look like? (see @ref{Gantt diagram})
+@itemize
+  @item If it's mostly green (running tasks), then the machine is properly
+  utilized, and perhaps the codelets are just slow. Check their performance, see
+  @ref{Codelet performance}.
+  @item If it's mostly purple (FetchingInput), tasks keep waiting for data
+  transfers; do you perhaps have far more communication than computation? Did
+  you properly use CUDA streams to make sure communication can be
+  overlapped? Did you use data-locality aware schedulers to avoid transfers as
+  much as possible?
+  @item If it's mostly red (Blocked), tasks keep waiting for dependencies,
+  do you have enough parallelism? It might be a good idea to check what the DAG
+  looks like (see @ref{DAG}).
+  @item If only some workers are completely red (Blocked), for some reason the
+  scheduler didn't assign tasks to them. Perhaps the performance model is bogus;
+  check it (see @ref{Codelet performance}). Do all your codelets have a
+  performance model?  When some of them don't, the scheduler switches to a
+  greedy algorithm which thus performs badly.
+@end itemize
+@end itemize

+ 48 - 0
doc/chapters/scaling-vector-example.texi

@@ -0,0 +1,48 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Main application::            
+* CPU Kernel::                 
+* CUDA Kernel::                
+* OpenCL Kernel::              
+@end menu
+
+@node Main application
+@section Main application
+
+@include chapters/vector_scal_c.texi
+
+@node CPU Kernel
+@section CPU Kernel
+
+@include chapters/vector_scal_cpu.texi
+
+@node CUDA Kernel
+@section CUDA Kernel
+
+@include chapters/vector_scal_cuda.texi
+
+@node OpenCL Kernel
+@section OpenCL Kernel
+
+@menu
+* Invoking the kernel::         
+* Source of the kernel::        
+@end menu
+
+@node Invoking the kernel
+@subsection Invoking the kernel
+
+@include chapters/vector_scal_opencl.texi
+
+@node Source of the kernel
+@subsection Source of the kernel
+
+@include chapters/vector_scal_opencl_codelet.texi
+

+ 25 - 0
doc/chapters/socl.texi

@@ -0,0 +1,25 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2012  Centre National de la Recherche Scientifique
+@c See the file starpu.texi for copying conditions.
+
+SOCL is an extension that aims at implementing the OpenCL standard on
+top of StarPU. It gives a (relatively) clean and
+standardized API to StarPU.
+By allowing OpenCL applications to use StarPU transparently, it
+provides users with the latest StarPU enhancements without any further
+development, and allows these OpenCL applications to easily fall back
+to another OpenCL implementation.
+
+This section does not require detailed knowledge of the StarPU
+library.
+
+Note: as of StarPU @value{VERSION}, this is still an area under
+development and subject to change.
+
+TODO
+
+
+
+

+ 79 - 0
doc/chapters/tips-tricks.texi

@@ -0,0 +1,79 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Per-worker library initialization::  How to initialize a computation library once for each worker?
+@end menu
+
+@node Per-worker library initialization
+@section How to initialize a computation library once for each worker?
+
+Some libraries need to be initialized once for each concurrent instance that
+may run on the machine. For instance, a C++ computation class which is not
+thread-safe by itself, but for which several instantiated objects of that class
+can be used concurrently. This can be handled in StarPU by initializing one such
+object per worker. For instance, the libstarpufft example does the following to
+be able to use FFTW.
+
+A global array stores the instantiated objects:
+
+@cartouche
+@smallexample
+fftw_plan plan_cpu[STARPU_NMAXWORKERS];
+@end smallexample
+@end cartouche
+
+At initialisation time of libstarpu, the objects are initialized:
+
+@cartouche
+@smallexample
+int workerid;
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) @{
+    switch (starpu_worker_get_type(workerid)) @{
+        case STARPU_CPU_WORKER:
+            plan_cpu[workerid] = fftw_plan(...);
+            break;
+    @}
+@}
+@end smallexample
+@end cartouche
+
+And in the codelet body, they are used:
+
+@cartouche
+@smallexample
+static void fft(void *descr[], void *_args)
+@{
+    int workerid = starpu_worker_get_id();
+    fftw_plan plan = plan_cpu[workerid];
+    ...
+
+    fftw_execute(plan, ...);
+@}
+@end smallexample
+@end cartouche
+
+Another way to go, which may be needed, is to execute some code from the workers
+themselves, thanks to @code{starpu_execute_on_each_worker}. This may be required
+for CUDA to behave properly due to threading issues. For instance, StarPU's
+@code{starpu_helper_cublas_init} looks like the following to call
+@code{cublasInit} from the workers themselves:
+
+@cartouche
+@smallexample
+static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
+@{
+    cublasStatus cublasst = cublasInit();
+    cublasSetKernelStream(starpu_cuda_get_local_stream());
+@}
+void starpu_helper_cublas_init(void)
+@{
+    starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
+@}
+@end smallexample
+@end cartouche

+ 113 - 0
doc/chapters/using.texi

@@ -0,0 +1,113 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+@c See the file starpu.texi for copying conditions.
+
+@menu
+* Setting flags for compiling and linking applications::  
+* Running a basic StarPU application::  
+* Kernel threads started by StarPU::
+* Enabling OpenCL::
+@end menu
+
+@node Setting flags for compiling and linking applications
+@section Setting flags for compiling and linking applications
+
+Compiling and linking an application against StarPU may require the use of
+specific flags or libraries (for instance @code{CUDA} or @code{libspe2}).
+To this end, it is possible to use the @code{pkg-config} tool.
+
+If StarPU was not installed at some standard location, the path of StarPU's
+library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
+that @code{pkg-config} can find it. For example if StarPU was installed in
+@code{$prefix_dir}:
+
+@example
+% PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
+@end example
+
+The flags required to compile or link against StarPU are then
+accessible with the following commands@footnote{It is still possible to use the API
+provided in the version 0.9 of StarPU by calling @code{pkg-config}
+with the @code{libstarpu} package. Similar packages are provided for
+@code{libstarpumpi} and @code{libstarpufft}.}:
+
+@example
+% pkg-config --cflags starpu-1.0  # options for the compiler
+% pkg-config --libs starpu-1.0    # options for the linker
+@end example
+
+Also pass the @code{--static} option if the application is to be
+linked statically.
+
+@node Running a basic StarPU application
+@section Running a basic StarPU application
+
+Basic examples using StarPU are built in the directory
+@code{examples/basic_examples/} (and installed in
+@code{$prefix_dir/lib/starpu/examples/}). You can for example run the example
+@code{vector_scal}.
+
+@example
+% ./examples/basic_examples/vector_scal
+BEFORE: First element was 1.000000
+AFTER: First element is 3.140000
+%
+@end example
+
+When StarPU is used for the first time, the directory
+@code{$STARPU_HOME/.starpu/} is created; performance models will be stored in
+that directory (@code{STARPU_HOME} defaults to @code{$HOME}).
+
+Please note that buses are benchmarked when StarPU is launched for the
+first time. This may take a few minutes, or less if @code{hwloc} is
+installed. This step is done only once per user and per machine.
+
+@node Kernel threads started by StarPU
+@section Kernel threads started by StarPU
+
+StarPU automatically binds one thread per CPU core. It does not use
+SMT/hyperthreading because kernels are usually already optimized for using a
+full core, and using hyperthreading would make kernel calibration rather random.
+
+Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU.
+
+While StarPU tasks are executing, the application is not supposed to do
+computations in the threads it starts itself; tasks should be used instead.
+
+TODO: add a StarPU function to bind an application thread (e.g. the main thread)
+to a dedicated core (and thus disable the corresponding StarPU CPU worker).
+
+@node Enabling OpenCL
+@section Enabling OpenCL
+
+When both CUDA and OpenCL drivers are enabled, StarPU will launch an
+OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
+This design choice was necessary as OpenCL and CUDA can not run at the
+same time on the same NVIDIA GPU, as there is currently no interoperability
+between them.
+
+To enable OpenCL, you need either to disable CUDA when configuring StarPU:
+
+@example
+% ./configure --disable-cuda
+@end example
+
+or when running applications:
+
+@example
+% STARPU_NCUDA=0 ./application
+@end example
+
+OpenCL will automatically be started on any device not yet used by
+CUDA. On a machine with 4 GPUs, it is therefore possible to
+enable CUDA on 2 devices, and OpenCL on the other 2 devices, by doing
+so:
+
+@example
+% STARPU_NCUDA=2 ./application
+@end example
+

+ 20 - 10
doc/vector_scal_c.texi

@@ -1,8 +1,16 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009-2011  Université de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c See the file starpu.texi for copying conditions.
+
+@smallexample
 /*
  * This example demonstrates how to use StarPU to scale an array by a factor.
  * It shows how to manipulate data with StarPU's data management library.
  *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
- *  2- how to describe which data are accessed by a task (task->buffers[0])
+ *  2- how to describe which data are accessed by a task (task->handles[0])
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
 #include <starpu.h>
@@ -11,22 +19,24 @@
 #define    NX    2048
 
 extern void scal_cpu_func(void *buffers[], void *_args);
+extern void scal_sse_func(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
 extern void scal_opencl_func(void *buffers[], void *_args);
 
-static starpu_codelet cl = @{
+static struct starpu_codelet cl = @{
     .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
     /* CPU implementation of the codelet */
-    .cpu_func = scal_cpu_func,
+    .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
 #ifdef STARPU_USE_CUDA
     /* CUDA implementation of the codelet */
-    .cuda_func = scal_cuda_func,
+    .cuda_funcs = @{ scal_cuda_func, NULL @},
 #endif
 #ifdef STARPU_USE_OPENCL
     /* OpenCL implementation of the codelet */
-    .opencl_func = scal_opencl_func,
+    .opencl_funcs = @{ scal_opencl_func, NULL @},
 #endif
-    .nbuffers = 1
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
 @};
 
 #ifdef STARPU_USE_OPENCL
@@ -42,7 +52,7 @@ int main(int argc, char **argv)
     for (i = 0; i < NX; i++)
         vector[i] = 1.0f;
 
-    fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);
+    fprintf(stderr, "BEFORE: First element was %f\n", vector[0]);
 
     /* Initialize StarPU with default configuration */
     starpu_init(NULL);
@@ -65,7 +75,7 @@ int main(int argc, char **argv)
      *  - the fourth argument is the number of elements in the vector
      *  - the fifth argument is the size of each element.
      */
-    starpu_data_handle vector_handle;
+    starpu_data_handle_t vector_handle;
     starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
                                 NX, sizeof(vector[0]));
 
@@ -79,8 +89,7 @@ int main(int argc, char **argv)
     task->cl = &cl;
 
     /* the codelet manipulates one buffer in RW mode */
-    task->buffers[0].handle = vector_handle;
-    task->buffers[0].mode = STARPU_RW;
+    task->handles[0] = vector_handle;
 
     /* an argument is passed to the codelet, beware that this is a
      * READ-ONLY buffer and that the codelet may be given a pointer to a
@@ -106,3 +115,4 @@ int main(int argc, char **argv)
 
     return 0;
 @}
+@end smallexample

+ 68 - 0
doc/chapters/vector_scal_cpu.texi

@@ -0,0 +1,68 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009-2011  Université de Bordeaux 1
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+@c See the file starpu.texi for copying conditions.
+
+@smallexample
+#include <starpu.h>
+#include <xmmintrin.h>
+
+/* This kernel takes a buffer and scales it by a constant factor */
+void scal_cpu_func(void *buffers[], void *cl_arg)
+@{
+    unsigned i;
+    float *factor = cl_arg;
+
+    /*
+     * The "buffers" array matches the task->handles array: for instance
+     * task->handles[0] is a handle that corresponds to a data with
+     * vector "interface", so that the first entry of the array in the
+     * codelet  is a pointer to a structure describing such a vector (ie.
+     * struct starpu_vector_interface *). Here, we therefore manipulate
+     * the buffers[0] element as a vector: nx gives the number of elements
+     * in the array, ptr gives the location of the array (that was possibly
+     * migrated/replicated), and elemsize gives the size of each elements.
+     */
+    struct starpu_vector_interface *vector = buffers[0];
+
+    /* length of the vector */
+    unsigned n = STARPU_VECTOR_GET_NX(vector);
+
+    /* get a pointer to the local copy of the vector: note that we have to
+     * cast it in (float *) since a vector could contain any type of
+     * elements so that the .ptr field is actually a uintptr_t */
+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+    /* scale the vector */
+    for (i = 0; i < n; i++)
+        val[i] *= *factor;
+@}
+
+void scal_sse_func(void *buffers[], void *cl_arg)
+@{
+    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+    unsigned int n_iterations = n/4;
+
+    __m128 *VECTOR = (__m128*) vector;
+    __m128 FACTOR __attribute__((aligned(16)));
+    float factor = *(float *) cl_arg;
+    FACTOR = _mm_set1_ps(factor);
+
+    unsigned int i;	
+    for (i = 0; i < n_iterations; i++)
+        VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
+
+    unsigned int remainder = n%4;
+    if (remainder != 0)
+    @{
+        unsigned int start = 4 * n_iterations;
+        for (i = start; i < start+remainder; ++i)
+        @{
+            vector[i] = factor * vector[i];
+        @}
+    @}
+@}
+@end smallexample

+ 9 - 0
doc/vector_scal_cuda.texi

@@ -1,3 +1,11 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009-2011  Université de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c See the file starpu.texi for copying conditions.
+
+@smallexample
 #include <starpu.h>
 #include <starpu_cuda.h>
 
@@ -24,3 +32,4 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 @}
+@end smallexample

+ 10 - 1
doc/vector_scal_opencl.texi

@@ -1,3 +1,11 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009-2011  Université de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c See the file starpu.texi for copying conditions.
+
+@smallexample
 #include <starpu.h>
 #include <starpu_opencl.h>
 
@@ -14,7 +22,7 @@ void scal_opencl_func(void *buffers[], void *_args)
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
+    cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
     id = starpu_worker_get_id();
     devid = starpu_worker_get_devid(id);
@@ -51,3 +59,4 @@ void scal_opencl_func(void *buffers[], void *_args)
 
     starpu_opencl_release_kernel(kernel);
 @}
+@end smallexample

+ 16 - 0
doc/chapters/vector_scal_opencl_codelet.texi

@@ -0,0 +1,16 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2009-2011  Université de Bordeaux 1
+@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c See the file starpu.texi for copying conditions.
+
+@smallexample
+__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
+@{
+        const int i = get_global_id(0);
+        if (i < nx) @{
+                val[i] *= factor;
+        @}
+@}
+@end smallexample

+ 12 - 0
doc/starpu.css

@@ -1,3 +1,15 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * Permission is granted to copy, distribute and/or modify this document
+ * under the terms of the GNU Free Documentation License, Version 1.3
+ * or any later version published by the Free Software Foundation;
+ * with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
+ * See the GNU Free Documentation License in COPYING.GFDL for more details.
+ */
+
 body {
 	font-size: 13px;
 /*	margin-top: 0px; */

File diff suppressed because it is too large
+ 135 - 4947
doc/starpu.texi


+ 29 - 15
doc/tutorial/Makefile

@@ -1,25 +1,39 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
+# Redistribution  and  use  in  source and binary forms, with or without
+# modification,  are  permitted  provided  that the following conditions
+# are met:
 #
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# * Redistributions  of  source  code  must  retain  the above copyright
+#   notice,  this  list  of  conditions  and  the  following  disclaimer.
+# * Redistributions  in  binary  form must reproduce the above copyright
+#   notice,  this list of conditions and the following disclaimer in the
+#   documentation  and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+#   derived from this software without specific prior written permission.
 #
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-CFLAGS          +=      $$(pkg-config --cflags libstarpu)
-LDFLAGS         +=      $$(pkg-config --libs libstarpu)
-
-HAS_CUDA	=	$(shell pkg-config --libs libstarpu|grep -i cuda)
+# THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+# SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+# LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+# DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+CFLAGS          +=      $$(pkg-config --cflags libstarpu-1.0)
+LDFLAGS         +=      $$(pkg-config --libs libstarpu-1.0)
+
+HAS_CUDA	=	$(shell pkg-config --libs libstarpu-1.0 |grep -i cuda)
 NVCC		?=	nvcc
-HAS_OPENCL	=	$(shell pkg-config --libs libstarpu|grep -i opencl)
+HAS_OPENCL	=	$(shell pkg-config --libs libstarpu-1.0 |grep -i opencl)
 
 %.o: %.cu
 	nvcc $(CFLAGS) $< -c

+ 21 - 8
doc/tutorial/README

@@ -3,16 +3,29 @@
 # Copyright (C) 2009-2011  Université de Bordeaux 1
 # Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 #
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
+# Redistribution  and  use  in  source and binary forms, with or without
+# modification,  are  permitted  provided  that the following conditions
+# are met:
 #
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# * Redistributions  of  source  code  must  retain  the above copyright
+#   notice,  this  list  of  conditions  and  the  following  disclaimer.
+# * Redistributions  in  binary  form must reproduce the above copyright
+#   notice,  this list of conditions and the following disclaimer in the
+#   documentation  and/or other materials provided with the distribution.
+# * The name of the author may not be used to endorse or promote products
+#   derived from this software without specific prior written permission.
 #
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+# THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+# SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+# LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+# DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 Instructions on how to compile and run StarPU examples

+ 23 - 10
doc/tutorial/hello_world.c

@@ -3,16 +3,29 @@
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
+ * Redistribution  and  use  in  source and binary forms, with or without
+ * modification,  are  permitted  provided  that the following conditions
+ * are met:
  *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * * Redistributions  of  source  code  must  retain  the above copyright
+ *   notice,  this  list  of  conditions  and  the  following  disclaimer.
+ * * Redistributions  in  binary  form must reproduce the above copyright
+ *   notice,  this list of conditions and the following disclaimer in the
+ *   documentation  and/or other materials provided with the distribution.
+ * * The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
  *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <starpu.h>
@@ -29,10 +42,10 @@ void cpu_func(void *buffers[], void *cl_arg)
     printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
 }
 
-starpu_codelet cl =
+struct starpu_codelet cl =
 {
     .where = STARPU_CPU,
-    .cpu_func = cpu_func,
+    .cpu_funcs = {cpu_func, NULL},
     .nbuffers = 0
 };
 

+ 31 - 18
doc/tutorial/vector_scal.c

@@ -1,25 +1,38 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
+ * Redistribution  and  use  in  source and binary forms, with or without
+ * modification,  are  permitted  provided  that the following conditions
+ * are met:
  *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * * Redistributions  of  source  code  must  retain  the above copyright
+ *   notice,  this  list  of  conditions  and  the  following  disclaimer.
+ * * Redistributions  in  binary  form must reproduce the above copyright
+ *   notice,  this list of conditions and the following disclaimer in the
+ *   documentation  and/or other materials provided with the distribution.
+ * * The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
  *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * This example demonstrates how to use StarPU to scale an array by a factor.
  * It shows how to manipulate data with StarPU's data management library.
  *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
- *  2- how to describe which data are accessed by a task (task->buffers[0])
+ *  2- how to describe which data are accessed by a task (task->handles[0])
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
 #include <starpu.h>
@@ -31,19 +44,20 @@ extern void scal_cpu_func(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
 extern void scal_opencl_func(void *buffers[], void *_args);
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl = {
     .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
     /* CPU implementation of the codelet */
-    .cpu_func = scal_cpu_func,
+    .cpu_funcs = {scal_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
     /* CUDA implementation of the codelet */
-    .cuda_func = scal_cuda_func,
+    .cuda_funcs = {scal_cuda_func, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
     /* OpenCL implementation of the codelet */
-    .opencl_func = scal_opencl_func,
+    .opencl_funcs = {scal_opencl_func, NULL},
 #endif
-    .nbuffers = 1
+    .nbuffers = 1,
+    .modes = {STARPU_RW}
 };
 
 #ifdef STARPU_USE_OPENCL
@@ -81,7 +95,7 @@ int main(int argc, char **argv)
      *  - the fourth argument is the number of elements in the vector
      *  - the fifth argument is the size of each element.
      */
-    starpu_data_handle vector_handle;
+    starpu_data_handle_t vector_handle;
     starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
                                 NX, sizeof(vector[0]));
 
@@ -95,8 +109,7 @@ int main(int argc, char **argv)
     task->cl = &cl;
 
     /* the codelet manipulates one buffer in RW mode */
-    task->buffers[0].handle = vector_handle;
-    task->buffers[0].mode = STARPU_RW;
+    task->handles[0] = vector_handle;
 
     /* an argument is passed to the codelet, beware that this is a
      * READ-ONLY buffer and that the codelet may be given a pointer to a

+ 26 - 13
doc/tutorial/vector_scal_cpu.c

@@ -1,18 +1,31 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
+ * Redistribution  and  use  in  source and binary forms, with or without
+ * modification,  are  permitted  provided  that the following conditions
+ * are met:
  *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * * Redistributions  of  source  code  must  retain  the above copyright
+ *   notice,  this  list  of  conditions  and  the  following  disclaimer.
+ * * Redistributions  in  binary  form must reproduce the above copyright
+ *   notice,  this list of conditions and the following disclaimer in the
+ *   documentation  and/or other materials provided with the distribution.
+ * * The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
  *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <starpu.h>
@@ -24,16 +37,16 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
     float *factor = cl_arg;
 
     /*
-     * The "buffers" array matches the task->buffers array: for instance
-     * task->buffers[0].handle is a handle that corresponds to a data with
+     * The "buffers" array matches the task->handles array: for instance
+     * task->handles[0] is a handle that corresponds to a data with
      * vector "interface", so that the first entry of the array in the
      * codelet  is a pointer to a structure describing such a vector (ie.
-     * struct starpu_vector_interface_s *). Here, we therefore manipulate
+     * struct starpu_vector_interface *). Here, we therefore manipulate
      * the buffers[0] element as a vector: nx gives the number of elements
      * in the array, ptr gives the location of the array (that was possibly
      * migrated/replicated), and elemsize gives the size of each element.
      */
-    starpu_vector_interface_t *vector = buffers[0];
+    struct starpu_vector_interface *vector = buffers[0];
 
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(vector);
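
The remainder of the kernel is outside this hunk; for reference, a sketch of how it presumably continues, based on the Texinfo listing removed further down (doc/vector_scal_cpu.texi):

    /* pointer to the local copy of the vector; the cast is needed because
     * the interface stores the address as a uintptr_t */
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

    /* scale the vector in place */
    unsigned i;
    for (i = 0; i < n; i++)
        val[i] *= *factor;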

+ 21 - 8
doc/tutorial/vector_scal_cuda.cu

@@ -3,16 +3,29 @@
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
+ * Redistribution  and  use  in  source and binary forms, with or without
+ * modification,  are  permitted  provided  that the following conditions
+ * are met:
  *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * * Redistributions  of  source  code  must  retain  the above copyright
+ *   notice,  this  list  of  conditions  and  the  following  disclaimer.
+ * * Redistributions  in  binary  form must reproduce the above copyright
+ *   notice,  this list of conditions and the following disclaimer in the
+ *   documentation  and/or other materials provided with the distribution.
+ * * The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
  *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <starpu.h>

+ 22 - 9
doc/tutorial/vector_scal_opencl.c

@@ -3,16 +3,29 @@
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
+ * Redistribution  and  use  in  source and binary forms, with or without
+ * modification,  are  permitted  provided  that the following conditions
+ * are met:
  *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * * Redistributions  of  source  code  must  retain  the above copyright
+ *   notice,  this  list  of  conditions  and  the  following  disclaimer.
+ * * Redistributions  in  binary  form must reproduce the above copyright
+ *   notice,  this list of conditions and the following disclaimer in the
+ *   documentation  and/or other materials provided with the distribution.
+ * * The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
  *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <starpu.h>
@@ -31,7 +44,7 @@ void scal_opencl_func(void *buffers[], void *_args)
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem) STARPU_VECTOR_GET_PTR(buffers[0]);
+    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
     id = starpu_worker_get_id();
     devid = starpu_worker_get_devid(id);
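
STARPU_VECTOR_GET_DEV_HANDLE now yields the device-side cl_mem rather than a host pointer, which is what clSetKernelArg expects. A sketch of how the function presumably continues (the opencl_program variable name and the factor extraction are assumptions, not shown in this hunk; declarations are repeated here for readability):

    cl_kernel kernel;
    cl_command_queue queue;
    int err;
    float *factor = _args;              /* scalar passed through cl_arg */
    int nx = (int) n;

    err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program,
                                    "vector_mult_opencl", devid);
    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

    err = clSetKernelArg(kernel, 0, sizeof(val), &val);    /* the cl_mem above */
    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
    err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
    err = clSetKernelArg(kernel, 2, sizeof(*factor), factor);
    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);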

+ 21 - 8
doc/tutorial/vector_scal_opencl_kernel.cl

@@ -3,16 +3,29 @@
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
+ * Redistribution  and  use  in  source and binary forms, with or without
+ * modification,  are  permitted  provided  that the following conditions
+ * are met:
  *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * * Redistributions  of  source  code  must  retain  the above copyright
+ *   notice,  this  list  of  conditions  and  the  following  disclaimer.
+ * * Redistributions  in  binary  form must reproduce the above copyright
+ *   notice,  this list of conditions and the following disclaimer in the
+ *   documentation  and/or other materials provided with the distribution.
+ * * The name of the author may not be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
  *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ * THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+ * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
+ * LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+ * DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 __kernel void vector_mult_opencl(__global float* val, int nx, float factor)
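
Because the kernel guards with i < nx (see the Texinfo listing removed below), it can be launched with any global size of at least nx. Continuing the host-side sketch above, one plausible enqueue looks like this (clEnqueueNDRangeKernel and clFinish are standard OpenCL calls; starpu_opencl_release_kernel is assumed to be the matching cleanup for starpu_opencl_load_kernel):

    size_t global = n;                  /* one work-item per vector element */
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL,
                                 0, NULL, NULL);
    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
    clFinish(queue);
    starpu_opencl_release_kernel(kernel);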

+ 0 - 32
doc/vector_scal_cpu.texi

@@ -1,32 +0,0 @@
-#include <starpu.h>
-
-/* This kernel takes a buffer and scales it by a constant factor */
-void scal_cpu_func(void *buffers[], void *cl_arg)
-@{
-    unsigned i;
-    float *factor = cl_arg;
-
-    /* 
-     * The "buffers" array matches the task->buffers array: for instance
-     * task->buffers[0].handle is a handle that corresponds to a data with
-     * vector "interface", so that the first entry of the array in the
-     * codelet  is a pointer to a structure describing such a vector (ie.
-     * struct starpu_vector_interface_s *). Here, we therefore manipulate
-     * the buffers[0] element as a vector: nx gives the number of elements
-     * in the array, ptr gives the location of the array (that was possibly
-     * migrated/replicated), and elemsize gives the size of each elements.
-     */
-    starpu_vector_interface_t *vector = buffers[0];
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(vector);
-
-    /* get a pointer to the local copy of the vector : note that we have to
-     * cast it in (float *) since a vector could contain any type of
-     * elements so that the .ptr field is actually a uintptr_t */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
-
-    /* scale the vector */
-    for (i = 0; i < n; i++)
-        val[i] *= *factor;
-@}

+ 0 - 7
doc/vector_scal_opencl_codelet.texi

@@ -1,7 +0,0 @@
-__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
-@{
-        const int i = get_global_id(0);
-        if (i < nx) @{
-                val[i] *= factor;
-        @}
-@}

+ 118 - 23
examples/Makefile.am

@@ -1,9 +1,9 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011  INRIA
+# Copyright (C) 2011-2012  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -17,7 +17,8 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
-LIBS = $(top_builddir)/src/libstarpu.la $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
+AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
@@ -25,17 +26,6 @@ AUTOMAKE_OPTIONS = subdir-objects
 
 SUBDIRS = stencil
 
-
-if STARPU_USE_SOCL
-SUBDIRS += socl
-endif
-
-if STARPU_HAVE_FFTW
-if STARPU_HAVE_FFTWF
-SUBDIRS += starpufft
-endif
-endif
-
 BUILT_SOURCES =
 
 if STARPU_USE_OPENCL
@@ -44,6 +34,8 @@ endif
 
 EXTRA_DIST = 					\
 	basic_examples/vector_scal_opencl_kernel.cl \
+	basic_examples/multiformat_opencl_kernel.cl  \
+	basic_examples/multiformat_conversion_codelets_opencl_kernel.cl \
 	common/blas_model.c			\
 	spmv/spmv_cuda.cu			\
 	spmv/spmv_opencl.cl			\
@@ -61,7 +53,9 @@ EXTRA_DIST = 					\
 	matvecmult/matvecmult_kernel.cl				\
 	basic_examples/block_opencl_kernel.cl			\
 	openmp/vector_scal.c			\
-	filters/fblock_opencl_kernel.cl
+	filters/fblock_opencl_kernel.cl		\
+	filters/custom_mf/conversion_opencl.cl  \
+	filters/custom_mf/custom_opencl.cl
 
 CLEANFILES = 					\
 	gordon/null_kernel_gordon.spuelf
@@ -99,6 +93,12 @@ BUILT_SOURCES +=				\
 
 endif
 
+if STARPU_HAVE_ICC
+.icc.o:
+	$(ICC) -x c $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+		$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) $< -c -o $@
+endif
+
 examplebindir = $(libdir)/starpu/examples/
 
 examplebin_PROGRAMS =
@@ -129,7 +129,11 @@ noinst_HEADERS = 				\
 	spmv/matrix_market/mmio.h		\
 	spmv/matrix_market/mm_to_bcsr.h		\
 	spmv/spmv.h				\
-	spmv/dw_block_spmv.h
+	spmv/dw_block_spmv.h                    \
+	basic_examples/multiformat_types.h      \
+	filters/custom_mf/custom_interface.h    \
+	filters/custom_mf/custom_types.h	\
+	interface/complex_interface.h
 
 #####################################
 # What to install and what to check #
@@ -147,9 +151,10 @@ endif
 if !STARPU_HAVE_WINDOWS
 ## test loader program
 LOADER			=	loader
+loader_CPPFLAGS =  $(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
-TESTS_ENVIRONMENT	=	$(LOADER_BIN)
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" $(LOADER_BIN)
 endif
 
 examplebin_PROGRAMS +=				\
@@ -158,17 +163,21 @@ examplebin_PROGRAMS +=				\
 	basic_examples/mult			\
 	basic_examples/block			\
 	basic_examples/variable			\
-	basic_examples/mult_impl                \
+	basic_examples/multiformat              \
+	cpp/incrementer_cpp			\
+	filters/custom_mf/custom_mf_filter      \
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
 	tag_example/tag_example			\
-	tag_example/tag_example3		\
 	tag_example/tag_example2		\
+	tag_example/tag_example3		\
+	tag_example/tag_example4		\
 	tag_example/tag_restartable		\
 	spmv/spmv				\
 	callback/callback			\
 	incrementer/incrementer			\
+	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	reductions/dot_product			\
@@ -218,16 +227,20 @@ STARPU_EXAMPLES +=				\
 	basic_examples/mult			\
 	basic_examples/block			\
 	basic_examples/variable			\
+	basic_examples/multiformat              \
+	cpp/incrementer_cpp			\
 	filters/fvector				\
 	filters/fblock				\
 	filters/fmatrix				\
 	tag_example/tag_example			\
-	tag_example/tag_example3		\
 	tag_example/tag_example2		\
+	tag_example/tag_example3		\
+	tag_example/tag_example4		\
 	tag_example/tag_restartable		\
 	spmv/spmv				\
 	callback/callback			\
 	incrementer/incrementer			\
+	interface/complex			\
 	matvecmult/matvecmult			\
 	profiling/profiling			\
 	scheduler/dummy_sched			\
@@ -277,6 +290,12 @@ basic_examples_vector_scal_SOURCES =		\
 	basic_examples/vector_scal.c		\
 	basic_examples/vector_scal_cpu.c
 
+if STARPU_HAVE_ICC
+basic_examples_vector_scal_SOURCES +=		\
+	basic_examples/vector_scal_cpu_icc.icc
+basic_examples/vector_scal_cpu_icc.o: CFLAGS += -Dscal_cpu_func=scal_cpu_func_icc -Dscal_sse_func=scal_sse_func_icc
+endif
+
 if STARPU_USE_CUDA
 basic_examples_vector_scal_SOURCES +=		\
 	basic_examples/vector_scal_cuda.cu
@@ -303,6 +322,29 @@ basic_examples_vector_scal_fortran_LDADD =	\
 endif
 endif
 
+#######################
+# Multiformat example #
+#######################
+basic_examples_multiformat_SOURCES =                                    \
+	basic_examples/multiformat.c                                    \
+	basic_examples/multiformat_conversion_codelets.c
+
+if STARPU_USE_CUDA
+basic_examples_multiformat_SOURCES+=                                     \
+	basic_examples/multiformat_cuda.cu                               \
+	basic_examples/multiformat_conversion_codelets_cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+basic_examples_multiformat_SOURCES+=                                     \
+	basic_examples/multiformat_opencl.c                              \
+	basic_examples/multiformat_conversion_codelets_opencl.c          
+
+nobase_STARPU_OPENCL_DATA_DATA+=                                         \
+	basic_examples/multiformat_opencl_kernel.cl                      \
+	basic_examples/multiformat_conversion_codelets_opencl_kernel.cl
+endif
+
 #################
 # block example #
 #################
@@ -362,6 +404,30 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	filters/fblock_opencl_kernel.cl
 endif
 
+
+#############################
+# Custom multiformat filter #
+#############################
+filters_custom_mf_custom_mf_filter_SOURCES=\
+	filters/custom_mf/custom_mf_filter.c \
+	filters/custom_mf/custom_interface.c   \
+	filters/custom_mf/custom_conversion_codelets.c
+
+if STARPU_USE_CUDA
+filters_custom_mf_custom_mf_filter_SOURCES+=\
+	filters/custom_mf/conversion.cu \
+	filters/custom_mf/cuda.cu
+endif
+
+if STARPU_USE_OPENCL
+filters_custom_mf_custom_mf_filter_SOURCES+=\
+	filters/custom_mf/conversion_opencl.c \
+	filters/custom_mf/custom_opencl.c
+nobase_STARPU_OPENCL_DATA_DATA += \
+	filters/custom_mf/conversion_opencl.cl \
+	filters/custom_mf/custom_opencl.cl
+endif
+
 ################
 # AXPY example #
 ################
@@ -606,11 +672,22 @@ spmv_dw_block_spmv_LDADD =			\
 	$(STARPU_BLAS_LDFLAGS)
 endif
 
+###########################
+# C++ Incrementer example #
+###########################
+
+cpp_incrementer_cpp_SOURCES	=	\
+	cpp/incrementer_cpp.cpp
+
+#if STARPU_USE_CUDA
+#cpp_incrementer_cpp_SOURCES +=	\
+#	incrementer/incrementer_kernels.cu
+#endif
+
 #######################
 # Incrementer example #
 #######################
 
-
 incrementer_incrementer_SOURCES =	\
 	incrementer/incrementer.c
 if STARPU_USE_CUDA
@@ -624,6 +701,18 @@ nobase_STARPU_OPENCL_DATA_DATA += \
 	incrementer/incrementer_kernels_opencl_kernel.cl
 endif
 
+#####################
+# interface example #
+#####################
+
+interface_complex_SOURCES	=	\
+	interface/complex.c		\
+	interface/complex_interface.c
+if STARPU_USE_CUDA
+interface_complex_SOURCES	+=	\
+	interface/complex_kernels.cu
+endif
+
 ######################
 # matVecMult example #
 ######################
@@ -651,7 +740,7 @@ endif
 mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
 if HAVE_X11
 mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
-mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
+mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) $(X_EXTRA_LIBS) -lX11
 endif
 
 ################
@@ -663,3 +752,9 @@ examplebin_PROGRAMS +=				\
 
 top_hello_world_top_SOURCES =			\
 	top/hello_world_top.c
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null
+	for i in $(SUBDIRS) ; do \
+		make -C $$i showcheck ; \
+	done

+ 16 - 0
examples/audio/Makefile

@@ -1,3 +1,19 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
 CFLAGS += -Wall -g3 -gdwarf-2 -O3 
 
 LIBS+=$$(pkg-config --libs libstarpu) -lcufft

+ 39 - 22
examples/audio/starpu_audio_processing.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -57,7 +57,7 @@ unsigned length_data;
 /* buffer containing input WAV data */
 float *A;
 
-starpu_data_handle A_handle;
+starpu_data_handle_t A_handle;
 
 /* For performance evaluation */
 static struct timeval start;
@@ -101,7 +101,8 @@ void read_16bit_wav(FILE *infile, unsigned size, float *arrayout, FILE *save_fil
 	/* we skip the header to only keep the data */
 	fseek(infile, headersize, SEEK_SET);
 	
-	for (v=0;v<size;v++) {
+	for (v=0;v<size;v++)
+	{
 		signed char val = (signed char)fgetc(infile);
 		signed char val2 = (signed char)fgetc(infile);
 
@@ -124,7 +125,8 @@ void write_16bit_wav(FILE *outfile, unsigned size, float *arrayin, FILE *save_fi
 	/* we assume that the header is copied using copy_wav_header */
 	fseek(outfile, headersize, SEEK_SET);
 	
-	for (v=0;v<size;v++) {
+	for (v=0;v<size;v++)
+	{
 		signed char val = ((int)arrayin[v]) % 256; 
 		signed char val2  = ((int)arrayin[v]) / 256;
 
@@ -146,7 +148,8 @@ void write_16bit_wav(FILE *outfile, unsigned size, float *arrayin, FILE *save_fi
  */
 
 /* we don't reinitialize the CUFFT plan for every kernel, so we "cache" it */
-typedef struct {
+typedef struct
+{
 	unsigned is_initialized;
 #ifdef STARPU_USE_CUDA
 	cufftHandle plan;
@@ -268,17 +271,20 @@ static void band_filter_kernel_cpu(void *descr[], __attribute__((unused)) void *
 		localA[i] /= nsamples;
 }
 
-struct starpu_perfmodel_t band_filter_model = {
+struct starpu_perfmodel band_filter_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "FFT_band_filter"
 };
 
-static starpu_codelet band_filter_cl = {
+static struct starpu_codelet band_filter_cl =
+{
+	.modes = { STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = band_filter_kernel_gpu,
+	.cuda_funcs = {band_filter_kernel_gpu, NULL},
 #endif
-	.cpu_func = band_filter_kernel_cpu,
+	.cpu_funcs = {band_filter_kernel_cpu, NULL},
 	.model = &band_filter_model,
 	.nbuffers = 1
 };
@@ -292,17 +298,18 @@ void callback(void *arg)
 
 void create_starpu_task(unsigned iter)
 {
+	int ret;
 	struct starpu_task *task = starpu_task_create();
 
 	task->cl = &band_filter_cl;
 
-	task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, iter);
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(A_handle, 1, iter);
 
 	task->callback_func = callback;
 	task->callback_arg = NULL;
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
 static void init_problem(void)
@@ -330,7 +337,8 @@ static void init_problem(void)
 	{
 		starpu_malloc((void **)&A, length_data*sizeof(float));
 	}
-	else {
+	else
+	{
 		A = malloc(length_data*sizeof(float));
 	}
 
@@ -344,31 +352,38 @@ static void init_problem(void)
 static void parse_args(int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-h") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-h") == 0)
+		{
 			fprintf(stderr, "Usage: %s [-pin] [-nsamples block_size] [-i input.wav] [-o output.wav | -no-output] [-h]\n", argv[0]);
 			exit(-1);
 		}
 
-		if (strcmp(argv[i], "-i") == 0) {
+		if (strcmp(argv[i], "-i") == 0)
+		{
 			inputfilename = argv[++i];;
 		}
 
-		if (strcmp(argv[i], "-o") == 0) {
+		if (strcmp(argv[i], "-o") == 0)
+		{
 			outputfilename = argv[++i];;
 		}
 
-		if (strcmp(argv[i], "-no-output") == 0) {
+		if (strcmp(argv[i], "-no-output") == 0)
+		{
 			outputfilename = NULL;;
 		}
 
 		/* block size */
-		if (strcmp(argv[i], "-nsamples") == 0) {
+		if (strcmp(argv[i], "-nsamples") == 0)
+		{
 			char *argptr;
 			nsamples = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-pin") == 0) {
+		if (strcmp(argv[i], "-pin") == 0)
+		{
 			use_pin = 1;
 		}
 	}
@@ -377,6 +392,7 @@ static void parse_args(int argc, char **argv)
 int main(int argc, char **argv)
 {
 	unsigned iter;
+	int ret;
 
 	parse_args(argc, argv);
 
@@ -389,11 +405,12 @@ int main(int argc, char **argv)
 	fprintf(stderr, "input: %s\noutput: %s\n#chunks %d\n", inputfilename, outputfilename, niter);
 
 	/* launch StarPU */
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
 
-	struct starpu_data_filter f = 
+	struct starpu_data_filter f =
 	{
 		.filter_func = starpu_block_filter_func_vector,
 		.nchildren = niter

+ 23 - 17
examples/axpy/axpy.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -41,7 +41,7 @@
 TYPE *vec_x, *vec_y;
 
 /* descriptors for StarPU */
-starpu_data_handle handle_y, handle_x;
+starpu_data_handle_t handle_y, handle_x;
 
 void axpy_cpu(void *descr[], __attribute__((unused)) void *arg)
 {
@@ -70,28 +70,35 @@ void axpy_gpu(void *descr[], __attribute__((unused)) void *arg)
 }
 #endif
 
-static starpu_codelet axpy_cl = {
+static struct starpu_codelet axpy_cl =
+{
         .where =
 #ifdef STARPU_USE_CUDA
                 STARPU_CUDA|
 #endif
                 STARPU_CPU,
 
-	.cpu_func = axpy_cpu,
+	.cpu_funcs = {axpy_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = axpy_gpu,
+	.cuda_funcs = {axpy_gpu, NULL},
 #endif
-	.nbuffers = 2
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW}
 };
 
 int main(int argc, char **argv)
 {
+	int ret;
+
 	/* Initialize StarPU */
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_helper_cublas_init();
 
-	/* This is equivalent to 
+	/* This is equivalent to
 		vec_x = malloc(N*sizeof(TYPE));
 		vec_y = malloc(N*sizeof(TYPE));
 	*/
@@ -116,7 +123,8 @@ int main(int argc, char **argv)
 	starpu_vector_data_register(&handle_y, 0, (uintptr_t)vec_y, N, sizeof(TYPE));
 
 	/* Divide the vector into blocks */
-	struct starpu_data_filter block_filter = {
+	struct starpu_data_filter block_filter =
+	{
 		.filter_func = starpu_block_filter_func_vector,
 		.nchildren = NBLOCKS
 	};
@@ -128,7 +136,7 @@ int main(int argc, char **argv)
 
 	struct timeval start;
 	struct timeval end;
-	
+
 	gettimeofday(&start, NULL);
 
 	unsigned b;
@@ -140,13 +148,11 @@ int main(int argc, char **argv)
 
 		task->cl_arg = &alpha;
 
-		task->buffers[0].handle = starpu_data_get_sub_data(handle_x, 1, b);
-		task->buffers[0].mode = STARPU_R;
-		
-		task->buffers[1].handle = starpu_data_get_sub_data(handle_y, 1, b);
-		task->buffers[1].mode = STARPU_RW;
-		
-		starpu_task_submit(task);
+		task->handles[0] = starpu_data_get_sub_data(handle_x, 1, b);
+		task->handles[1] = starpu_data_get_sub_data(handle_y, 1, b);
+
+		ret = starpu_task_submit(task);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	}
 
 	starpu_task_wait_for_all();
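
A recurring pattern in this merge: starpu_init() and starpu_task_submit() may return -ENODEV when no suitable worker exists, and the examples then exit with status 77, which the Automake test harness treats as SKIP rather than FAIL. The condensed shape of that pattern, using only calls shown in these files:

	int ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;                  /* no worker available: skip the test */
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* ... register data, create and fill a struct starpu_task *task ... */

	ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	starpu_task_wait_for_all();
	starpu_shutdown();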

+ 32 - 21
examples/basic_examples/block.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,29 +35,30 @@ typedef void (*device_func)(void **, void *);
 
 int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny, int pnz, float multiplier)
 {
-	starpu_codelet cl = {};
-	starpu_data_handle block_handle;
+	struct starpu_codelet cl = {};
+	starpu_data_handle_t block_handle;
         int i;
 
 	starpu_block_data_register(&block_handle, 0, (uintptr_t)block, pnx, pnx*pny, pnx, pny, pnz, sizeof(float));
 
 	cl.where = where;
-        cl.cuda_func = func;
-        cl.cpu_func = func;
-        cl.opencl_func = func;
+        cl.cuda_funcs[0] = func;
+        cl.cpu_funcs[0] = func;
+        cl.opencl_funcs[0] = func;
         cl.nbuffers = 1;
+	cl.modes[0] = STARPU_RW;
         cl.model = NULL;
 
         struct starpu_task *task = starpu_task_create();
         task->cl = &cl;
         task->callback_func = NULL;
-        task->buffers[0].handle = block_handle;
-        task->buffers[0].mode = STARPU_RW;
+        task->handles[0] = block_handle;
 	task->cl_arg = &multiplier;
 	task->cl_arg_size = sizeof(multiplier);
 
         int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
                 FPRINTF(stderr, "No worker may execute this task\n");
                 return 1;
 	}
@@ -67,8 +68,9 @@ int execute_on(uint32_t where, device_func func, float *block, int pnx, int pny,
 	/* update the array in RAM */
 	starpu_data_unregister(block_handle);
 
-        for(i=0 ; i<pnx*pny*pnz; i++) {
-          FPRINTF(stderr, "%f ", block[i]);
+        for(i=0 ; i<pnx*pny*pnz; i++)
+	{
+		FPRINTF(stderr, "%f ", block[i]);
         }
         FPRINTF(stderr, "\n");
 
@@ -84,13 +86,19 @@ int main(int argc, char **argv)
         int nz=4;
         float multiplier=1.0;
 
-        starpu_init(NULL);
+        ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
         block = (float*)malloc(nx*ny*nz*sizeof(float));
         assert(block);
-        for(k=0 ; k<nz ; k++) {
-                for(j=0 ; j<ny ; j++) {
-                        for(i=0 ; i<nx ; i++) {
+        for(k=0 ; k<nz ; k++)
+	{
+                for(j=0 ; j<ny ; j++)
+		{
+                        for(i=0 ; i<nx ; i++)
+			{
                                 block[(k*nx*ny)+(j*nx)+i] = n++;
                         }
                 }
@@ -99,7 +107,8 @@ int main(int argc, char **argv)
         ret = execute_on(STARPU_CPU, cpu_codelet, block, nx, ny, nz, 1.0);
         if (!ret) multiplier *= 1.0;
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code, NULL);
+        ret = starpu_opencl_load_opencl_from_file("examples/basic_examples/block_opencl_kernel.cl", &opencl_code, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
         ret = execute_on(STARPU_OPENCL, opencl_codelet, block, nx, ny, nz, 2.0);
         if (!ret) multiplier *= 2.0;
 #endif
@@ -110,11 +119,13 @@ int main(int argc, char **argv)
 
         /* Check result is correct */
         ret=1;
-        for(i=0 ; i<nx*ny*nz ; i++) {
-          if (block[i] != (i+1) * multiplier) {
-            ret=0;
-            break;
-          }
+        for(i=0 ; i<nx*ny*nz ; i++)
+	{
+		if (block[i] != (i+1) * multiplier)
+		{
+			ret=0;
+			break;
+		}
         }
 
         FPRINTF(stderr,"TEST %s\n", ret==1?"PASSED":"FAILED");

+ 5 - 3
examples/basic_examples/block_cpu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -28,8 +28,10 @@ void cpu_codelet(void *descr[], void *_args)
         float *multiplier = (float *)_args;
         unsigned i, j, k;
 
-        for(k=0; k<nz ; k++) {
-                for(j=0; j<ny ; j++) {
+        for(k=0; k<nz ; k++)
+	{
+                for(j=0; j<ny ; j++)
+		{
                         for(i=0; i<nx ; i++)
                                 block[(k*ldz)+(j*ldy)+i] *= *multiplier;
                 }

+ 5 - 3
examples/basic_examples/block_cuda.cu

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,8 +20,10 @@
 static __global__ void cuda_block(float *block, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
 {
         int i, j, k;
-        for(k=0; k<nz ; k++) {
-                for(j=0; j<ny ; j++) {
+        for(k=0; k<nz ; k++)
+	{
+                for(j=0; j<ny ; j++)
+		{
                         for(i=0; i<nx ; i++)
                                 block[(k*ldz)+(j*ldy)+i] *= multiplier;
                 }

+ 18 - 11
examples/basic_examples/block_opencl.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,6 +18,15 @@
 #include <starpu.h>
 #include <starpu_opencl.h>
 
+#define CHECK_CL_SET_KERNEL_ARG(kernel, n, size, ptr)       \
+do						    	    \
+{							    \
+	int err;                                            \
+	err = clSetKernelArg(kernel, n, size, ptr);         \
+	if (err != CL_SUCCESS)                              \
+       		STARPU_OPENCL_REPORT_ERROR(err);            \
+} while (0)
+
 extern struct starpu_opencl_program opencl_code;
 
 void opencl_codelet(void *descr[], void *_args)
@@ -26,7 +35,7 @@ void opencl_codelet(void *descr[], void *_args)
 	cl_command_queue queue;
 	cl_event event;
 	int id, devid, err;
-	cl_mem block = (cl_mem)STARPU_BLOCK_GET_PTR(descr[0]);
+	cl_mem block = (cl_mem)STARPU_BLOCK_GET_DEV_HANDLE(descr[0]);
 	int nx = (int)STARPU_BLOCK_GET_NX(descr[0]);
 	int ny = (int)STARPU_BLOCK_GET_NY(descr[0]);
 	int nz = (int)STARPU_BLOCK_GET_NZ(descr[0]);
@@ -40,15 +49,13 @@ void opencl_codelet(void *descr[], void *_args)
         err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_code, "block", devid);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = 0;
-	err = clSetKernelArg(kernel, 0, sizeof(block), &block);
-	err = clSetKernelArg(kernel, 1, sizeof(nx), &nx);
-	err = clSetKernelArg(kernel, 2, sizeof(ny), &ny);
-	err = clSetKernelArg(kernel, 3, sizeof(nz), &nz);
-	err = clSetKernelArg(kernel, 4, sizeof(ldy), &ldy);
-	err = clSetKernelArg(kernel, 5, sizeof(ldz), &ldz);
-	err = clSetKernelArg(kernel, 6, sizeof(*multiplier), multiplier);
-        if (err) STARPU_OPENCL_REPORT_ERROR(err);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 0, sizeof(block), &block);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 1, sizeof(nx), &nx);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 2, sizeof(ny), &ny);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 3, sizeof(nz), &nz);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 4, sizeof(ldy), &ldy);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 5, sizeof(ldz), &ldz);
+	CHECK_CL_SET_KERNEL_ARG(kernel, 6, sizeof(*multiplier), multiplier);
 
 	{
                 size_t global=nx*ny*nz;
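
The new CHECK_CL_SET_KERNEL_ARG macro also fixes a weakness of the old code, where err was overwritten by every clSetKernelArg call so only the last one was effectively checked. Its do { ... } while (0) wrapper makes the multi-statement body behave as a single statement; a hypothetical illustration (use_multiplier and skip_multiplier_arg do not exist in this file):

	/* without the do { ... } while (0) wrapper, the semicolon written
	 * after the macro call would terminate the if-statement and the
	 * 'else' below would no longer parse */
	if (use_multiplier)
		CHECK_CL_SET_KERNEL_ARG(kernel, 6, sizeof(*multiplier), multiplier);
	else
		skip_multiplier_arg();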

+ 5 - 3
examples/basic_examples/block_opencl_kernel.cl

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,8 +17,10 @@
 __kernel void block(__global float *b, int nx, int ny, int nz, unsigned ldy, unsigned ldz, float multiplier)
 {
         int i, j, k;
-        for(k=0; k<nz ; k++) {
-                for(j=0; j<ny ; j++) {
+        for(k=0; k<nz ; k++)
+	{
+                for(j=0; j<ny ; j++)
+		{
                         for(i=0; i<nx ; i++)
                                 b[(k*ldz)+(j*ldy)+i] *= multiplier;
                 }

+ 20 - 11
examples/basic_examples/hello_world.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -46,10 +46,12 @@ void callback_func(void *callback_arg)
  * DSM; the second arguments references read-only data that is passed as an
  * argument of the codelet (task->cl_arg). Here, "buffers" is unused as there
  * are no data input/output managed by the DSM (cl.nbuffers = 0) */
-struct params {
+struct params
+{
 	int i;
 	float f;
 };
+
 void cpu_func(void *buffers[], void *cl_arg)
 {
 	struct params *params = (struct params *) cl_arg;
@@ -57,17 +59,21 @@ void cpu_func(void *buffers[], void *cl_arg)
 	FPRINTF(stdout, "Hello world (params = {%i, %f} )\n", params->i, params->f);
 }
 
-starpu_codelet cl = {};
+struct starpu_codelet cl = {};
 
 int main(int argc, char **argv)
 {
 	struct starpu_task *task;
 	struct params params = {1, 2.0f};
+	int ret;
 
 	/* initialize StarPU : passing a NULL argument means that we use
  	* default configuration for the scheduling policies and the number of
 	* processors/accelerators */
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* create a new task that is non-blocking by default : the task is not
 	 * submitted to the scheduler until the starpu_task_submit function is
@@ -77,7 +83,7 @@ int main(int argc, char **argv)
 	/* this codelet may only be executed on a CPU, and its cpu
  	 * implementation is function "cpu_func" */
 	cl.where = STARPU_CPU;
-	cl.cpu_func = cpu_func;
+	cl.cpu_funcs[0] = cpu_func;
 	/* the codelet does not manipulate any data that is managed
 	 * by our DSM */
 	cl.nbuffers = 0;
@@ -95,7 +101,7 @@ int main(int argc, char **argv)
 	 * argument (cl_arg) is NOT a valid synchronization medium! */
 	task->cl_arg = &params;
 	task->cl_arg_size = sizeof(params);
-		
+
 	/* once the task has been executed, callback_func(0x42)
 	 * will be called on a CPU */
 	task->callback_func = callback_func;
@@ -103,13 +109,12 @@ int main(int argc, char **argv)
 
 	/* starpu_task_submit will be a blocking call */
 	task->synchronous = 1;
-	
+
 	/* submit the task to StarPU */
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
-	/* destroy the task */
-	starpu_task_destroy(task);
-	
 	/* terminate StarPU: statistics and other debug outputs are not
 	 * guaranteed to be generated unless this function is called. Once it
 	 * is called, it is not possible to submit tasks anymore, and the user
@@ -119,4 +124,8 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 
 	return 0;
+
+enodev:
+	starpu_shutdown();
+	return 77;
 }
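
Since task->synchronous = 1 makes starpu_task_submit() block until the task has completed, the example needs no explicit wait. A sketch of the asynchronous variant, using only calls that appear elsewhere in this merge:

	task->synchronous = 0;              /* submit returns immediately */
	ret = starpu_task_submit(task);
	if (ret == -ENODEV) goto enodev;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	/* ... possibly submit more tasks ... */

	starpu_task_wait_for_all();         /* block until everything has run */
	starpu_shutdown();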

+ 49 - 26
examples/basic_examples/mult.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -27,7 +27,7 @@
  *  - how to unpartition data (starpu_data_unpartition) and how to stop
  *    monitoring data (starpu_data_unregister)
  *  - how to manipulate subsets of data (starpu_data_get_sub_data)
- *  - how to construct an autocalibrated performance model (starpu_perfmodel_t)
+ *  - how to construct an autocalibrated performance model (starpu_perfmodel)
  *  - how to submit asynchronous tasks
  */
 
@@ -41,7 +41,7 @@
 #include <starpu.h>
 
 static float *A, *B, *C;
-static starpu_data_handle A_handle, B_handle, C_handle;
+static starpu_data_handle_t A_handle, B_handle, C_handle;
 
 static unsigned nslicesx = 4;
 static unsigned nslicesy = 4;
@@ -133,20 +133,26 @@ static void init_problem_data(void)
 
 	/* fill the A and B matrices */
 	srand(2009);
-	for (j=0; j < ydim; j++) {
-		for (i=0; i < zdim; i++) {
+	for (j=0; j < ydim; j++)
+	{
+		for (i=0; i < zdim; i++)
+		{
 			A[j+i*ydim] = (float)(starpu_drand48());
 		}
 	}
 
-	for (j=0; j < zdim; j++) {
-		for (i=0; i < xdim; i++) {
+	for (j=0; j < zdim; j++)
+	{
+		for (i=0; i < xdim; i++)
+		{
 			B[j+i*zdim] = (float)(starpu_drand48());
 		}
 	}
 
-	for (j=0; j < ydim; j++) {
-		for (i=0; i < xdim; i++) {
+	for (j=0; j < ydim; j++)
+	{
+		for (i=0; i < xdim; i++)
+		{
 			C[j+i*ydim] = (float)(0);
 		}
 	}
@@ -186,16 +192,18 @@ static void partition_mult_data(void)
 	/* StarPU supplies some basic filters such as the partition of a matrix
 	 * into blocks, note that we are using a FORTRAN ordering so that the
 	 * name of the filters are a bit misleading */
-	struct starpu_data_filter vert = {
+	struct starpu_data_filter vert =
+	{
 		.filter_func = starpu_vertical_block_filter_func,
 		.nchildren = nslicesx
 	};
-		
-	struct starpu_data_filter horiz = {
+
+	struct starpu_data_filter horiz =
+	{
 		.filter_func = starpu_block_filter_func,
 		.nchildren = nslicesy
 	};
-		
+
 /*
  *	Illustration with nslicex = 4 and nslicey = 2, it is possible to access
  *	sub-data by using the "starpu_data_get_sub_data" method, which takes a data handle,
@@ -246,25 +254,29 @@ static void partition_mult_data(void)
 	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
 }
 
-static struct starpu_perfmodel_t mult_perf_model = {
+static struct starpu_perfmodel mult_perf_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "mult_perf_model"
 };
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
         /* we can only execute that kernel on a CPU yet */
         .where = STARPU_CPU,
         /* CPU implementation of the codelet */
-        .cpu_func = cpu_mult,
+        .cpu_funcs = {cpu_mult, NULL},
         /* the codelet manipulates 3 buffers that are managed by the
          * DSM */
         .nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_W},
         /* in case the scheduling policy may use performance models */
         .model = &mult_perf_model
 };
 
-static void launch_tasks(void)
+static int launch_tasks(void)
 {
+	int ret;
 	/* partition the work into slices */
 	unsigned taskx, tasky;
 
@@ -301,10 +313,8 @@ static void launch_tasks(void)
 			 * identified by "tasky" (respectively "taskx). The "1"
 			 * tells StarPU that there is a single argument to the
 			 * variable-arity function starpu_data_get_sub_data */
-			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
-			task->buffers[0].mode = STARPU_R;
-			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
-			task->buffers[1].mode = STARPU_R;
+			task->handles[0] = starpu_data_get_sub_data(A_handle, 1, tasky);
+			task->handles[1] = starpu_data_get_sub_data(B_handle, 1, taskx);
 
 			/* 2 filters were applied on matrix C, so we give
 			 * starpu_data_get_sub_data 2 arguments. The order of the arguments
@@ -315,20 +325,27 @@ static void launch_tasks(void)
 			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
 			 * equivalent to
 			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
-			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
-			task->buffers[2].mode = STARPU_W;
+			task->handles[2] = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
 
 			/* this is not a blocking call since task->synchronous = 0 */
-			starpu_task_submit(task);
+			ret = starpu_task_submit(task);
+			if (ret == -ENODEV) return ret;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 	}
+	return 0;
 }
 
 int main(__attribute__ ((unused)) int argc, 
 	 __attribute__ ((unused)) char **argv)
 {
+	int ret;
+
 	/* start the runtime */
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* initialize matrices A, B and C and register them to StarPU */
 	init_problem_data();
@@ -338,7 +355,8 @@ int main(__attribute__ ((unused)) int argc,
 	partition_mult_data();
 
 	/* submit all tasks in an asynchronous fashion */
-	launch_tasks();
+	ret = launch_tasks();
+	if (ret == -ENODEV) goto enodev;
 
 	/* wait for termination */
         starpu_task_wait_for_all();
@@ -367,4 +385,9 @@ int main(__attribute__ ((unused)) int argc,
 	starpu_shutdown();
 
 	return 0;
+
+enodev:
+	starpu_shutdown();
+	return 77;
 }
+
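
The tail of mult.c is outside these hunks; as in the listing of mult_impl.c removed just below, the example presumably ends by reassembling and releasing C. A sketch of that sequence:

	/* undo the two filters: gather the blocks of C back into main
	 * memory (memory node 0) */
	starpu_data_unpartition(C_handle, 0);

	/* stop monitoring C; this also ensures the data is back in RAM */
	starpu_data_unregister(C_handle);

	starpu_shutdown();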

+ 0 - 384
examples/basic_examples/mult_impl.c

@@ -1,384 +0,0 @@
-/*/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Télécom-SudParis
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-
-#include <string.h>
-#include <math.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include <signal.h>
-
-#include <starpu.h>
-
-static float *A, *B, *C;
-static starpu_data_handle A_handle, B_handle, C_handle;
-
-static unsigned nslicesx = 4;
-static unsigned nslicesy = 4;
-static unsigned xdim = 1024;
-static unsigned ydim = 1024;
-static unsigned zdim = 512;
-
-
-double mult_gemm_cost(starpu_buffer_descr *descr)
-{
-	/* C = A * B */
-	uint32_t nxC, nyC, nxA;
-
-
-	nxC = starpu_matrix_get_nx(descr[2].handle);
-	nyC = starpu_matrix_get_ny(descr[2].handle);
-	nxA = starpu_matrix_get_nx(descr[0].handle);
-
-	//printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA);
-
-	double cost = ((double)nxC)*((double)nyC)*((double)nxA/1000.0f/4.11f);
-
-	printf("cost %e \n", cost);
-
-	return cost;
-}
-
-static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
-{
-	float *subA, *subB, *subC;
-	uint32_t nxC, nyC, nyA;
-	uint32_t ldA, ldB, ldC;
-	printf("On application: Hello, this is kernel cpu_mult\n\n");
-	/* .blas.ptr gives a pointer to the first element of the local copy */
-	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
-	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
-
-	/* .blas.nx is the number of rows (consecutive elements) and .blas.ny
-	 * is the number of lines that are separated by .blas.ld elements (ld
-	 * stands for leading dimension).
-	 * NB: in case some filters were used, the leading dimension is not
-	 * guaranteed to be the same in main memory (on the original matrix)
-	 * and on the accelerator! */
-	nxC = STARPU_MATRIX_GET_NX(descr[2]);
-	nyC = STARPU_MATRIX_GET_NY(descr[2]);
-	nyA = STARPU_MATRIX_GET_NY(descr[0]);
-
-	ldA = STARPU_MATRIX_GET_LD(descr[0]);
-	ldB = STARPU_MATRIX_GET_LD(descr[1]);
-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
-
-	/* we assume a FORTRAN-ordering! */
-	unsigned i,j,k;
-	for (i = 0; i < nyC; i++)
-	{
-		for (j = 0; j < nxC; j++)
-		{
-			float sum = 0.0;
-
-			for (k = 0; k < nyA; k++)
-			{
-				sum += subA[j+k*ldA]*subB[k+i*ldB];
-			}
-
-			subC[j + i*ldC] = sum;
-		}
-	}
-}
-
-static void cpu_mult_2(void *descr[], __attribute__((unused))  void *arg)
-{
-	float *subA, *subB, *subC;
-	uint32_t nxC, nyC, nyA;
-	uint32_t ldA, ldB, ldC;
-	printf("On application: this is kernel cpu_mult_2\n\n");
-	/* .blas.ptr gives a pointer to the first element of the local copy */
-	subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-	subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
-	subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
-
-	nxC = STARPU_MATRIX_GET_NX(descr[2]);
-	nyC = STARPU_MATRIX_GET_NY(descr[2]);
-	nyA = STARPU_MATRIX_GET_NY(descr[0]);
-
-	ldA = STARPU_MATRIX_GET_LD(descr[0]);
-	ldB = STARPU_MATRIX_GET_LD(descr[1]);
-	ldC = STARPU_MATRIX_GET_LD(descr[2]);
-
-	/* we assume a FORTRAN-ordering! */
-	unsigned i,j,k;
-	for (j = 0; j < nxC; j++)
-	{
-		for (i = 0; i < nyC; i++)
-		{
-			float sum = 0.0;
-
-			for (k = 0; k < nyA; k++)
-			{
-				sum += subA[j+k*ldA]*subB[k+i*ldB];
-			}
-
-			subC[j + i*ldC] = sum;
-		}
-	}
-}
-
-
-
-static void init_problem_data(void)
-{
-	unsigned i,j;
-
-	/* we initialize matrices A, B and C in the usual way */
-
-	A = malloc(zdim*ydim*sizeof(float));
-	B = malloc(xdim*zdim*sizeof(float));
-	C = malloc(xdim*ydim*sizeof(float));
-
-	/* fill the A and B matrices */
-	srand(2009);
-	for (j=0; j < ydim; j++) {
-		for (i=0; i < zdim; i++) {
-			A[j+i*ydim] = (float)(starpu_drand48());
-		}
-	}
-
-	for (j=0; j < zdim; j++) {
-		for (i=0; i < xdim; i++) {
-			B[j+i*zdim] = (float)(starpu_drand48());
-		}
-	}
-
-	for (j=0; j < ydim; j++) {
-		for (i=0; i < xdim; i++) {
-			C[j+i*ydim] = (float)(0);
-		}
-	}
-}
-
-static void partition_mult_data(void)
-{
-	/* note that we assume a FORTRAN ordering here! */
-
-	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A,
-		ydim, ydim, zdim, sizeof(float));
-	starpu_matrix_data_register(&B_handle, 0, (uintptr_t)B,
-		zdim, zdim, xdim, sizeof(float));
-	starpu_matrix_data_register(&C_handle, 0, (uintptr_t)C,
-		ydim, ydim, xdim, sizeof(float));
-
-	/* A filter is a method to partition a data into disjoint chunks, it is
-	 * described by the means of the "struct starpu_data_filter" structure that
-	 * contains a function that is applied on a data handle to partition it
-	 * into smaller chunks, and an argument that is passed to the function
-	 * (eg. the number of blocks to create here).
-	 */
-
-	struct starpu_data_filter vert = {
-		.filter_func = starpu_vertical_block_filter_func,
-		.nchildren = nslicesx,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
-	};
-
-	struct starpu_data_filter horiz = {
-		.filter_func = starpu_block_filter_func,
-		.nchildren = nslicesy,
-		.get_nchildren = NULL,
-		.get_child_ops = NULL
-	};
-
-/*
- *	Illustration with nslicex = 4 and nslicey = 2, it is possible to access
- *	sub-data by using the "starpu_data_get_sub_data" method, which takes a data handle,
- *	the number of filters to apply, and the indexes for each filters, for
- *	instance:
- *
- *		A' handle is starpu_data_get_sub_data(A_handle, 1, 1);
- *		B' handle is starpu_data_get_sub_data(B_handle, 1, 2);
- *		C' handle is starpu_data_get_sub_data(C_handle, 2, 2, 1);
- *
- *	Note that here we applied 2 filters recursively onto C.
- *
- *	"starpu_data_get_sub_data(C_handle, 1, 3)" would return a handle to the 4th column
- *	of blocked matrix C for example.
- *
- *		              |---|---|---|---|
- *		              |   |   | B'|   | B
- *		              |---|---|---|---|
- *		                0   1   2   3
- *		     |----|   |---|---|---|---|
- *		     |    |   |   |   |   |   |
- *		     |    | 0 |   |   |   |   |
- *		     |----|   |---|---|---|---|
- *		     | A' |   |   |   | C'|   |
- *		     |    |   |   |   |   |   |
- *		     |----|   |---|---|---|---|
- *		       A              C
- *
- *	IMPORTANT: applying filters is equivalent to partitionning a piece of
- *	data in a hierarchical manner, so that memory consistency is enforced
- *	for each of the elements independantly. The tasks should therefore NOT
- *	access inner nodes (eg. one column of C or the whole C) but only the
- *	leafs of the tree (ie. blocks here). Manipulating inner nodes is only
- *	possible by disapplying the filters (using starpu_data_unpartition), to
- *	enforce memory consistency.
- */
-
-	starpu_data_partition(B_handle, &vert);
-	starpu_data_partition(A_handle, &horiz);
-
-	/* starpu_data_map_filters is a variable-arity function, the first argument
-	 * is the handle of the data to partition, the second argument is the
-	 * number of filters to apply recursively. Filters are applied in the
-	 * same order as the arguments.
-	 * This would be equivalent to starpu_data_partition(C_handle, &vert) and
-	 * then applying horiz on each sub-data (ie. each column of C)
-	 */
-	starpu_data_map_filters(C_handle, 2, &vert, &horiz);
-}
-
-static struct starpu_perfmodel_t starpu_dgemm_model_common = {
-	.cost_model = mult_gemm_cost,
-	.type = STARPU_HISTORY_BASED,//STARPU_COMMON, //STARPU_PER_ARCH,
-	.symbol = "mult_perf_model"
-};
-
-/*
-static struct starpu_perfmodel_t mult_perf_model = {
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "mult_perf_model"
-};
-*/
-
-struct starpu_conf conf = {
-		.sched_policy_name = "heft",
-		.calibrate = 1,
-		.ncpus = 4
-};
-
-
-static starpu_codelet cl = {
-        /* we can only execute that kernel on a CPU yet */
-        .where = STARPU_CPU,
-        //.starpu_impl_multiple = 1,
-        /* CPU implementation of the codelet */
-        .cpu_func = STARPU_MULTIPLE_CPU_IMPLEMENTATIONS,
-        .cpu_funcs = {cpu_mult,cpu_mult_2},
-        /* the codelet manipulates 3 buffers that are managed by the
-         * DSM */
-        .nbuffers = 3,
-        /* in case the scheduling policy may use performance models */
-        .model = &starpu_dgemm_model_common
-};
-
-static void launch_tasks(void)
-{
-	/* partition the work into slices */
-	unsigned taskx, tasky;
-
-	for (taskx = 0; taskx < nslicesx; taskx++)
-	{
-		for (tasky = 0; tasky < nslicesy; tasky++)
-		{
-			/* C[taskx, tasky] = A[tasky] B[taskx] */
-
-			/* by default, starpu_task_create() returns an
- 			 * asynchronous task (ie. task->synchronous = 0) */
-			struct starpu_task *task = starpu_task_create();
-
-			/* this task implements codelet "cl" */
-			task->cl = &cl;
-
-			/*
-			 *              |---|---|---|---|
-			 *              |   | * |   |   | B
-			 *              |---|---|---|---|
-			 *                    X
-			 *     |----|   |---|---|---|---|
-			 *     |****| Y |   |***|   |   |
-			 *     |****|   |   |***|   |   |
-			 *     |----|   |---|---|---|---|
-			 *     |    |   |   |   |   |   |
-			 *     |    |   |   |   |   |   |
-			 *     |----|   |---|---|---|---|
-			 *       A              C
-			 */
-
-			/* there was a single filter applied to matrices A
-			 * (respectively B) so we grab the handle to the chunk
-			 * identified by "tasky" (respectively "taskx"). The "1"
-			 * tells StarPU that there is a single argument to the
-			 * variable-arity function starpu_data_get_sub_data */
-			task->buffers[0].handle = starpu_data_get_sub_data(A_handle, 1, tasky);
-			task->buffers[0].mode = STARPU_R;
-			task->buffers[1].handle = starpu_data_get_sub_data(B_handle, 1, taskx);
-			task->buffers[1].mode = STARPU_R;
-
-			/* 2 filters were applied on matrix C, so we give
-			 * starpu_data_get_sub_data 2 arguments. The order of the arguments
-			 * must match the order in which the filters were
-			 * applied.
-			 * NB: starpu_data_get_sub_data(C_handle, 1, k) would have returned
-			 * a handle to the column number k of matrix C.
-			 * NB2: starpu_data_get_sub_data(C_handle, 2, taskx, tasky) is
-			 * equivalent to
-			 * starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky)*/
-			task->buffers[2].handle = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
-			task->buffers[2].mode = STARPU_W;
-
-			/* this is not a blocking call since task->synchronous = 0 */
-			int submit_ret;
-			submit_ret = starpu_task_submit(task);
-			printf("task submission returned %d\n", submit_ret);
-
-		}
-	}
-}
-
-int main(void)
-{
-	/* start the runtime */
-	starpu_init(&conf);
-
-	/* initialize matrices A, B and C and register them to StarPU */
-	init_problem_data();
-
-	/* partition matrices into blocks that can be manipulated by the
- 	 * codelets */
-	partition_mult_data();
-
-	/* submit all tasks in an asynchronous fashion */
-	launch_tasks();
-
-	/* wait for termination */
-	starpu_task_wait_for_all();
-
-	/* remove the filters applied by means of starpu_data_map_filters; now
- 	 * it's not possible to manipulate a subset of C using starpu_data_get_sub_data until
-	 * starpu_data_map_filters is called again on C_handle.
-	 * The second argument is the memory node where the different subsets
-	 * should be reassembled, 0 = main memory (RAM) */
-	starpu_data_unpartition(C_handle, 0);
-
-	/* stop monitoring matrix C: after this, it is not possible to pass C
-	 * (or any subset of C) as a codelet input/output. This also implements
-	 * a barrier so that the piece of data is put back into main memory in
-	 * case it was only available on a GPU for instance. */
-	starpu_data_unregister(C_handle);
-
-	starpu_shutdown();
-
-	return 0;
-}
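
The equivalence spelled out in the comments above can be summed up in a short sketch (assuming the vert/horiz filters and the C_handle, taskx and tasky variables from the removed example; this is illustrative only):

    /* Both calls name the same block of C: the variable-arity form walks
     * the filter tree one level per extra argument. */
    starpu_data_handle block_a = starpu_data_get_sub_data(C_handle, 2, taskx, tasky);
    starpu_data_handle block_b =
        starpu_data_get_sub_data(starpu_data_get_sub_data(C_handle, 1, taskx), 1, tasky);

    /* Inner nodes (a whole column, or C itself) only become accessible again
     * once the filters are removed and the blocks reassembled in main memory. */
    starpu_data_unpartition(C_handle, 0);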

+ 329 - 0
examples/basic_examples/multiformat.c

@@ -0,0 +1,329 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2012 Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+#include "multiformat_types.h"
+
+static int ncpu = 0;
+#ifdef STARPU_USE_CUDA
+static int ncuda = 0;
+#endif
+#ifdef STARPU_USE_OPENCL
+static int nopencl = 0;
+#endif
+
+static struct point array_of_structs[N_ELEMENTS];
+static starpu_data_handle_t array_of_structs_handle;
+
+static void
+multiformat_scal_cpu_func(void *buffers[], void *args)
+{
+	struct point *aos;
+	unsigned int n, i;
+
+	aos = (struct point *) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+	n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+	for (i = 0; i < n; i++)
+	{
+		aos[i].x *= aos[i].y;
+	}
+}
+
+#ifdef STARPU_USE_CUDA
+extern struct starpu_codelet cpu_to_cuda_cl;
+extern struct starpu_codelet cuda_to_cpu_cl;
+#endif
+
+#ifdef STARPU_USE_OPENCL
+extern struct starpu_codelet cpu_to_opencl_cl;
+extern struct starpu_codelet opencl_to_cpu_cl;
+#endif
+
+static struct starpu_multiformat_data_interface_ops format_ops =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_elemsize = 2* sizeof(float),
+	.cpu_to_cuda_cl = &cpu_to_cuda_cl,
+	.cuda_to_cpu_cl = &cuda_to_cpu_cl,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_elemsize = 2 * sizeof(float),
+	.cpu_to_opencl_cl = &cpu_to_opencl_cl,
+	.opencl_to_cpu_cl = &opencl_to_cpu_cl,
+#endif
+	.cpu_elemsize = sizeof(struct point),
+
+};
+
+#ifdef STARPU_USE_CUDA
+extern void multiformat_scal_cuda_func(void *buffers[], void *arg);
+#endif
+#ifdef STARPU_USE_OPENCL
+extern void multiformat_scal_opencl_func(void *buffers[], void *arg);
+#endif
+
+#ifdef STARPU_USE_CPU
+static struct starpu_codelet cpu_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {multiformat_scal_cpu_func, NULL},
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.name = "codelet_real"
+};
+#endif /* !STARPU_USE_CPU */
+
+#ifdef STARPU_USE_CUDA
+static struct starpu_codelet cuda_cl =
+{
+	.where = STARPU_CUDA,
+	.cuda_funcs = { multiformat_scal_cuda_func, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.name = "cuda_codelet"
+};
+#endif /* !STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+static struct starpu_codelet opencl_cl =
+{
+	.where = STARPU_OPENCL,
+	.opencl_funcs = { multiformat_scal_opencl_func, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.name = "opencl_codelet"
+};
+#endif /* !STARPU_USE_OPENCL */
+
+/*
+ * Main functions 
+ */
+static void
+init_problem_data(void)
+{
+	int i; 
+	for (i = 0; i < N_ELEMENTS; i++)
+	{
+		array_of_structs[i].x = 1.0 + i;
+		array_of_structs[i].y = 42.0;
+	}
+}
+
+static void
+register_data(void)
+{
+	starpu_multiformat_data_register(&array_of_structs_handle,
+					 0,
+					 &array_of_structs,
+					 N_ELEMENTS,
+					 &format_ops);
+}
+
+static int
+create_and_submit_task(unsigned int dev)
+{
+	struct starpu_task *task = starpu_task_create();
+	switch (dev)
+	{
+#ifdef STARPU_USE_CPU
+		case STARPU_CPU:
+			task->cl = &cpu_cl;
+			break;
+#endif
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA:
+			task->cl = &cuda_cl;
+			break;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL:
+			task->cl = &opencl_cl;
+			break;
+#endif
+		default:
+			assert(0);
+	}
+	task->synchronous = 1;
+	task->handles[0] = array_of_structs_handle;
+	task->cl_arg = NULL;
+	task->cl_arg_size = 0;
+	return starpu_task_submit(task);
+}
+
+static void
+create_and_submit_tasks(void)
+{
+	int err;
+
+#ifdef STARPU_USE_CUDA
+	if (ncuda > 0)
+	{
+		err = create_and_submit_task(STARPU_CUDA);
+		if (err != 0)
+		{
+			FPRINTF(stderr, "Cuda : %s\n", strerror(-err));
+			return;
+		}
+	}
+#endif
+
+#ifdef STARPU_USE_CPU
+	if (ncpu > 0)
+	{
+		err = create_and_submit_task(STARPU_CPU);
+		if (err != 0)
+		{
+			FPRINTF(stderr, "CPU : %s\n", strerror(-err));
+			return;
+		}
+	}
+#endif
+
+#ifdef STARPU_USE_OPENCL
+	if (nopencl > 0)
+	{
+		err = create_and_submit_task(STARPU_OPENCL);
+		if (err != 0)
+		{
+			FPRINTF(stderr, "OpenCL : %s\n", strerror(-err));
+			return;
+		}
+	}
+#endif /* !STARPU_USE_OPENCL */
+}
+
+static void
+unregister_data(void)
+{
+	starpu_data_unregister(array_of_structs_handle);
+}
+
+static void
+print_it(void)
+{
+	int i;
+	for (i = 0; i < N_ELEMENTS; i++)
+	{
+		FPRINTF(stderr, "(%.2f %.2f) ",
+			array_of_structs[i].x,
+			array_of_structs[i].y);
+	}
+	FPRINTF(stderr, "\n");
+}
+
+static int
+check_it(void)
+{
+	int i;
+	for (i = 0; i < N_ELEMENTS; i++)
+	{
+		float expected_value = i + 1.0;
+#ifdef STARPU_USE_CUDA
+		if (ncuda > 0)
+			expected_value *= array_of_structs[i].y;
+#endif
+#ifdef STARPU_USE_OPENCL
+		if (nopencl > 0)
+			expected_value *= array_of_structs[i].y;
+#endif
+		expected_value *= array_of_structs[i].y;
+		if (array_of_structs[i].x != expected_value)
+			return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+struct starpu_opencl_program opencl_conversion_program;
+#endif
+
+static int
+gpus_available()
+{
+#ifdef STARPU_USE_CUDA
+	if (ncuda > 0)
+		return 1;
+#endif
+#ifdef STARPU_USE_OPENCL
+	if (nopencl > 0)
+		return 1;
+#endif
+
+	return 0;
+}
+
+int
+main(void)
+{
+#ifdef STARPU_USE_CPU
+	int ret;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ncpu = starpu_cpu_worker_get_count();
+#ifdef STARPU_USE_CUDA
+	ncuda = starpu_cuda_worker_get_count();
+#endif
+#ifdef STARPU_USE_OPENCL
+	nopencl = starpu_opencl_worker_get_count();
+#endif
+
+	if (ncpu == 0 || !gpus_available())
+	{
+		starpu_shutdown();
+		return 77;
+	}
+
+#ifdef STARPU_USE_OPENCL
+	ret = starpu_opencl_load_opencl_from_file("examples/basic_examples/multiformat_opencl_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+	ret = starpu_opencl_load_opencl_from_file("examples/basic_examples/multiformat_conversion_codelets_opencl_kernel.cl", 
+						  &opencl_conversion_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+#endif
+	init_problem_data();
+
+	print_it();
+
+	register_data();
+
+	create_and_submit_tasks();
+
+	unregister_data();
+
+	print_it();
+
+#ifdef STARPU_USE_OPENCL
+        starpu_opencl_unload_opencl(&opencl_program);
+        starpu_opencl_unload_opencl(&opencl_conversion_program);
+#endif
+	starpu_shutdown();
+
+
+	return check_it();
+#else
+	/* Without a CPU, the multiformat interface serves no purpose,
+	 * so this test is skipped. */
+	return 77;
+#endif
+}
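
The arithmetic behind check_it() is easy to lose in the #ifdefs: every task that actually ran (always the CPU one, plus a CUDA and/or OpenCL one when such workers exist) multiplies x once by y, and y stays at 42.0. A hypothetical helper, for illustration only:

    /* expected value of element i after 'ntasks' scaling tasks have run */
    static float expected_x(unsigned i, unsigned ntasks)
    {
        float x = 1.0f + i;       /* as set in init_problem_data() */
        unsigned t;
        for (t = 0; t < ntasks; t++)
            x *= 42.0f;           /* each task computes x *= y, and y == 42.0 */
        return x;
    }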

+ 81 - 0
examples/basic_examples/multiformat_conversion_codelets.c

@@ -0,0 +1,81 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "multiformat_types.h"
+
+#ifdef STARPU_USE_CUDA
+void cuda_to_cpu(void *buffers[], void *arg)
+{
+	struct struct_of_arrays *src = STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+	struct point *dst = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+	int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	int i;
+	for (i = 0; i < n; i++)
+	{
+		dst[i].x = src->x[i];
+		dst[i].y = src->y[i];
+	}
+}
+
+extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
+struct starpu_codelet cpu_to_cuda_cl =
+{
+	.where = STARPU_CUDA,
+	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.nbuffers = 1,
+	.name = "codelet_cpu_to_cuda"
+};
+
+struct starpu_codelet cuda_to_cpu_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {cuda_to_cpu, NULL},
+	.nbuffers = 1,
+	.name = "codelet_cuda_to_cpu"
+};
+#endif
+
+#ifdef STARPU_USE_OPENCL
+void opencl_to_cpu(void *buffers[], void *arg)
+{
+	FPRINTF(stderr, "Entering %s\n", __func__);
+	struct struct_of_arrays *src = STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
+	struct point *dst = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+	int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	int i;
+	for (i = 0; i < n; i++)
+	{
+		dst[i].x = src->x[i];
+		dst[i].y = src->y[i];
+	}
+}
+
+extern void cpu_to_opencl_opencl_func(void *buffers[], void *args);
+struct starpu_codelet cpu_to_opencl_cl =
+{
+	.where = STARPU_OPENCL,
+	.opencl_funcs = {cpu_to_opencl_opencl_func, NULL},
+	.nbuffers = 1
+};
+
+struct starpu_codelet opencl_to_cpu_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {opencl_to_cpu, NULL},
+	.nbuffers = 1
+};
+#endif

+ 50 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "multiformat_types.h"
+
+static __global__ void cpu_to_cuda_cuda(struct point *src,
+	struct struct_of_arrays *dst, unsigned n)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i < n)
+	{
+		dst->x[i] = src[i].x;
+		dst->y[i] = src[i].y;
+	}
+
+}
+
+extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
+{
+	struct point *src;
+	struct struct_of_arrays *dst;
+
+	src = (struct point *) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+	dst = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+
+	int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+
+        cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 99 - 0
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -0,0 +1,99 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+extern struct starpu_opencl_program opencl_conversion_program;
+
+void cpu_to_opencl_opencl_func(void *buffers[], void *args)
+{
+	(void) args;
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+	cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_conversion_program,
+					"cpu_to_opencl_opencl",
+					devid);
+
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 0, sizeof(src), &src);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 1, sizeof(dst), &dst);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 2, sizeof(n), &n);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 14 - 4
examples/starpufft/cudaf_kernels.cu

@@ -1,7 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,5 +14,16 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "float.h"
-#include "cudax_kernels.cu"
+#include "multiformat_types.h"
+
+__kernel void cpu_to_opencl_opencl(__global struct point *src,
+				   __global struct struct_of_arrays *dst,
+				   unsigned int n)
+{
+	const unsigned int i = get_global_id(0);
+	if (i < n)
+	{
+		dst->x[i] = src[i].x;
+		dst->y[i] = src[i].y;
+	}
+}

+ 43 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -0,0 +1,43 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "multiformat_types.h"
+
+static __global__ void multiformat_cuda(struct struct_of_arrays *soa, unsigned n)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i < n)
+		soa->x[i] *= soa->y[i];
+}
+
+extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
+{
+	(void) _args;
+
+	FPRINTF(stderr, "Running the cuda kernel (%s)\n", __func__);
+	unsigned int n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	struct struct_of_arrays *soa;
+
+	soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+        multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 92 - 0
examples/basic_examples/multiformat_opencl.c

@@ -0,0 +1,92 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+
+extern struct starpu_opencl_program opencl_program;
+
+void multiformat_scal_opencl_func(void *buffers[], void *args)
+{
+	(void) args;
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+	cl_mem val = (cl_mem)STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"multiformat_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err  = clSetKernelArg(kernel, 0, sizeof(val), &val);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	err = clSetKernelArg(kernel, 1, sizeof(n), &n);
+	if (err)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(queue,
+					kernel,
+					1,
+					NULL,
+					&global,
+					&local,
+					0,
+					NULL,
+					&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 8 - 5
examples/starpufft/starpufft_common.c

@@ -1,7 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "starpufft.h"
+#include "multiformat_types.h"
 
-/* Used as an identifier in starpu tags to let plans run concurrently */
-int starpufft_last_plan_number;
+__kernel void multiformat_opencl(__global struct struct_of_arrays *soa, int nx)
+{
+        const int i = get_global_id(0);
+        if (i < nx)
+		soa->x[i] *= soa->y[i];
+}

+ 33 - 0
examples/basic_examples/multiformat_types.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#ifndef MULTIFORMAT_TYPES_H
+#define MULTIFORMAT_TYPES_H
+
+#define N_ELEMENTS 10
+
+struct struct_of_arrays
+{
+	float x[N_ELEMENTS];
+	float y[N_ELEMENTS];
+};
+struct point
+{
+	float x, y;
+};
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+#endif

+ 18 - 10
examples/basic_examples/variable.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -38,10 +38,13 @@ int main(int argc, char **argv)
 {
 	unsigned i;
         float foo;
-	starpu_data_handle float_array_handle;
-	starpu_codelet cl = {};
+	starpu_data_handle_t float_array_handle;
+	struct starpu_codelet cl = {};
+	int ret;
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 #ifdef STARPU_SLOW_MACHINE
 	niter /= 100;
@@ -53,18 +56,20 @@ int main(int argc, char **argv)
                                       (uintptr_t)&foo, sizeof(float));
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program, NULL);
+        ret = starpu_opencl_load_opencl_from_file("examples/basic_examples/variable_kernels_opencl_kernel.cl", &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
 
 	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
-        cl.cpu_func = cpu_codelet;
+        cl.cpu_funcs[0] = cpu_codelet;
 #ifdef STARPU_USE_CUDA
-        cl.cuda_func = cuda_codelet;
+        cl.cuda_funcs[0] = cuda_codelet;
 #endif
 #ifdef STARPU_USE_OPENCL
-        cl.opencl_func = opencl_codelet;
+        cl.opencl_funcs[0] = opencl_codelet;
 #endif
         cl.nbuffers = 1;
+	cl.modes[0] = STARPU_RW;
         cl.model = NULL;
 
 	for (i = 0; i < niter; i++)
@@ -76,8 +81,7 @@ int main(int argc, char **argv)
 
 		task->callback_func = NULL;
 
-		task->buffers[0].handle = float_array_handle;
-		task->buffers[0].mode = STARPU_RW;
+		task->handles[0] = float_array_handle;
 
 		ret = starpu_task_submit(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
@@ -97,4 +101,8 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 
 	return 0;
+
+enodev:
+	starpu_shutdown();
+	return 77;
 }
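
The same conversion is applied throughout this commit: the access mode moves from the task to the codelet, and the task only lists data handles. A minimal sketch of the resulting pattern, reusing the names from variable.c above (illustrative only):

    struct starpu_codelet cl =
    {
        .where = STARPU_CPU,
        .cpu_funcs = { cpu_codelet, NULL },
        .nbuffers = 1,
        .modes = { STARPU_RW },              /* formerly task->buffers[0].mode */
    };

    struct starpu_task *task = starpu_task_create();
    task->cl = &cl;
    task->handles[0] = float_array_handle;   /* formerly task->buffers[0].handle */
    int ret = starpu_task_submit(task);
    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");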

+ 0 - 1
examples/basic_examples/variable_kernels_opencl.c

@@ -33,7 +33,6 @@ void opencl_codelet(void *descr[], void *_args)
 	err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "variable", devid);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = 0;
 	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 

+ 57 - 24
examples/basic_examples/vector_scal.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,44 +19,64 @@
  * This example demonstrates how to use StarPU to scale an array by a factor.
  * It shows how to manipulate data with StarPU's data management library.
  *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
- *  2- how to describe which data are accessed by a task (task->buffers[0])
+ *  2- how to describe which data are accessed by a task (task->handles[0])
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  */
 
 #include <starpu.h>
 #include <starpu_opencl.h>
+#include <stdlib.h>
 #include <stdio.h>
+#include <math.h>
 
-#define	NX	2048
+#define	NX	204800
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
 extern void scal_cpu_func(void *buffers[], void *_args);
+extern void scal_cpu_func_icc(void *buffers[], void *_args);
+extern void scal_sse_func(void *buffers[], void *_args);
+extern void scal_sse_func_icc(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
 extern void scal_opencl_func(void *buffers[], void *_args);
 
-static struct starpu_perfmodel_t vector_scal_model = {
+static struct starpu_perfmodel vector_scal_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "vector_scale"
 };
 
-static struct starpu_perfmodel_t vector_scal_power_model = {
+static struct starpu_perfmodel vector_scal_power_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "vector_scale_power"
 };
 
-static starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
 	.where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
 	/* CPU implementation of the codelet */
-	.cpu_func = scal_cpu_func,
+	.cpu_funcs = {
+		scal_cpu_func
+#ifdef STARPU_HAVE_ICC
+		, scal_cpu_func_icc
+#endif
+#ifdef __SSE__
+		, scal_sse_func
+#ifdef STARPU_HAVE_ICC
+		, scal_sse_func_icc
+#endif
+#endif
+	},
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
-	.cuda_func = scal_cuda_func,
+	.cuda_funcs = {scal_cuda_func, NULL},
 #endif
 #ifdef STARPU_USE_OPENCL
 	/* OpenCL implementation of the codelet */
-	.opencl_func = scal_opencl_func,
+	.opencl_funcs = {scal_opencl_func, NULL},
 #endif
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.model = &vector_scal_model,
 	.power_model = &vector_scal_power_model
 };
@@ -65,6 +85,13 @@ static starpu_codelet cl = {
 struct starpu_opencl_program opencl_program;
 #endif
 
+static int approximately_equal(float a, float b)
+{
+	int ai = (int) nearbyintf(a * 1000.0);
+	int bi = (int) nearbyintf(b * 1000.0);
+	return ai == bi;
+}
+
 int main(int argc, char **argv)
 {
 	/* We consider a vector of float that is initialized just as any of C
@@ -74,15 +101,17 @@ int main(int argc, char **argv)
 	for (i = 0; i < NX; i++)
                 vector[i] = (i+1.0f);
 
-	FPRINTF(stderr, "BEFORE: First element was %f\n", vector[0]);
-	FPRINTF(stderr, "BEFORE: Last element was %f\n", vector[NX-1]);
-
 	/* Initialize StarPU with default configuration */
-	starpu_init(NULL);
+	int ret = starpu_init(NULL);
+	if (ret == -ENODEV) goto enodev;
+
+	FPRINTF(stderr, "[BEFORE] 1-th element    : %3.2f\n", vector[1]);
+	FPRINTF(stderr, "[BEFORE] (NX-1)th element: %3.2f\n", vector[NX-1]);
 
 #ifdef STARPU_USE_OPENCL
-	starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl",
-					    &opencl_program, NULL);
+	ret = starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl",
+						  &opencl_program, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
 
 	/* Tell StarPU to associate the "vector" vector with the "vector_handle"
@@ -98,7 +127,7 @@ int main(int argc, char **argv)
 	 *  - the fourth argument is the number of elements in the vector
 	 *  - the fifth argument is the size of each element.
 	 */
-	starpu_data_handle vector_handle;
+	starpu_data_handle_t vector_handle;
 	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 	float factor = 3.14;
@@ -111,8 +140,7 @@ int main(int argc, char **argv)
 	task->cl = &cl;
 
 	/* the codelet manipulates one buffer in RW mode */
-	task->buffers[0].handle = vector_handle;
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = vector_handle;
 
 	/* an argument is passed to the codelet, beware that this is a
 	 * READ-ONLY buffer and that the codelet may be given a pointer to a
@@ -121,14 +149,13 @@ int main(int argc, char **argv)
 	task->cl_arg_size = sizeof(factor);
 
 	/* execute the task on any eligible computational resource */
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	/* StarPU does not need to manipulate the array anymore so we can stop
  	 * monitoring it */
 	starpu_data_unregister(vector_handle);
 
-	starpu_task_destroy(task);
-
 #ifdef STARPU_USE_OPENCL
         starpu_opencl_unload_opencl(&opencl_program);
 #endif
@@ -136,8 +163,14 @@ int main(int argc, char **argv)
 	/* terminate StarPU, no task can be submitted after */
 	starpu_shutdown();
 
-	FPRINTF(stderr, "AFTER: First element is %f\n", vector[0]);
-	FPRINTF(stderr, "AFTER: Last element is %f\n", vector[NX-1]);
+	FPRINTF(stderr, "[AFTER] 1-th element     : %3.2f (should be %3.2f)\n", vector[1], (1+1.0f) * factor);
+	FPRINTF(stderr, "[AFTER] (NX-1)-th element: %3.2f (should be %3.2f)\n", vector[NX-1], (NX-1+1.0f) * factor);
+
+	return ((approximately_equal(vector[1], (1+1.0f) * factor)
+		 && approximately_equal(vector[NX-1], (NX-1+1.0f) * factor))
+		? EXIT_SUCCESS
+		: EXIT_FAILURE);
 
-	return 0;
+enodev:
+	return 77;
 }

+ 18 - 13
examples/basic_examples/vector_scal_c.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@
  * This example demonstrates how to use StarPU to scale an array by a factor.
  * It shows how to manipulate data with StarPU's data management library.
  *  1- how to declare a piece of data to StarPU (starpu_vector_data_register)
- *  2- how to describe which data are accessed by a task (task->buffers[0])
+ *  2- how to describe which data are accessed by a task (task->handles[0])
  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
  *
  * This is a variant of vector_scal.c which shows it can be integrated with fortran.
@@ -33,18 +33,21 @@
 extern void scal_cpu_func(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
 
-static struct starpu_perfmodel_t vector_scal_model = {
+static struct starpu_perfmodel vector_scal_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "vector_scale_model"
 };
 
-static starpu_codelet cl = {
-  .where = STARPU_CPU | STARPU_CUDA,
+static struct starpu_codelet cl =
+{
+	.modes = { STARPU_RW },
+	.where = STARPU_CPU | STARPU_CUDA,
 	/* CPU implementation of the codelet */
-	.cpu_func = scal_cpu_func,
+	.cpu_funcs = {scal_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
-	.cuda_func = scal_cuda_func,
+	.cuda_funcs = {scal_cuda_func, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &vector_scal_model
@@ -53,9 +56,11 @@ static starpu_codelet cl = {
 void compute_(int *F_NX, float *vector)
 {
         int NX = *F_NX;
-	
+	int ret;
+
 	/* Initialize StarPU with default configuration */
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Tell StarPU to associate the "vector" vector with the "vector_handle"
 	 * identifier. When a task needs to access a piece of data, it should
@@ -70,7 +75,7 @@ void compute_(int *F_NX, float *vector)
 	 *  - the fourth argument is the number of elements in the vector
 	 *  - the fifth argument is the size of each element.
 	 */
-	starpu_data_handle vector_handle;
+	starpu_data_handle_t vector_handle;
 	starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
 
 	float factor = 3.14;
@@ -83,8 +88,7 @@ void compute_(int *F_NX, float *vector)
 	task->cl = &cl;
 
 	/* the codelet manipulates one buffer in RW mode */
-	task->buffers[0].handle = vector_handle;
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = vector_handle;
 
 	/* an argument is passed to the codelet, beware that this is a
 	 * READ-ONLY buffer and that the codelet may be given a pointer to a
@@ -93,7 +97,8 @@ void compute_(int *F_NX, float *vector)
 	task->cl_arg_size = sizeof(factor);
 
 	/* execute the task on any eligible computational resource */
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	/* StarPU does not need to manipulate the array anymore so we can stop
  	 * monitoring it */

+ 35 - 5
examples/basic_examples/vector_scal_cpu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,9 @@
  */
 
 #include <starpu.h>
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
 
 /* This kernel takes a buffer and scales it by a constant factor */
 void scal_cpu_func(void *buffers[], void *cl_arg)
@@ -27,17 +30,17 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 	float *factor = (float *) cl_arg;
 
 	/*
-	 * The "buffers" array matches the task->buffers array: for instance
-	 * task->buffers[0].handle is a handle that corresponds to a data with
+	 * The "buffers" array matches the task->handles array: for instance
+	 * task->handles[0] is a handle that corresponds to a data with
 	 * vector "interface", so that the first entry of the array in the
 	 * codelet  is a pointer to a structure describing such a vector (ie.
-	 * struct starpu_vector_interface_s *). Here, we therefore manipulate
+	 * struct starpu_vector_interface *). Here, we therefore manipulate
 	 * the buffers[0] element as a vector: nx gives the number of elements
 	 * in the array, ptr gives the location of the array (that was possibly
 	 * migrated/replicated), and elemsize gives the size of each elements.
 	 */
 
-	starpu_vector_interface_t *vector = (starpu_vector_interface_t *) buffers[0];
+	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];
 
 	/* length of the vector */
 	unsigned n = STARPU_VECTOR_GET_NX(vector);
@@ -52,3 +55,30 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 		val[i] *= *factor;
 }
 
+#ifdef __SSE__
+void scal_sse_func(void *buffers[], void *cl_arg)
+{
+	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+	unsigned int n_iterations = n/4;
+
+	__m128 *VECTOR = (__m128*) vector;
+	__m128 FACTOR __attribute__((aligned(16)));
+	float factor = *(float *) cl_arg;
+	FACTOR = _mm_set1_ps(factor);
+
+	unsigned int i;	
+	for (i = 0; i < n_iterations; i++)
+		VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
+
+	unsigned int remainder = n%4;
+	if (remainder != 0)
+	{
+		unsigned int start = 4 * n_iterations;
+		for (i = start; i < start+remainder; ++i)
+		{
+			vector[i] = factor * vector[i];
+		}
+	}
+}
+#endif

+ 1 - 0
examples/basic_examples/vector_scal_cpu_icc.icc

@@ -0,0 +1 @@
+vector_scal_cpu.c

+ 1 - 1
examples/basic_examples/vector_scal_opencl.c

@@ -37,7 +37,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 	/* length of the vector */
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
 	/* OpenCL copy of the vector pointer */
-	cl_mem val = (cl_mem)STARPU_VECTOR_GET_PTR(buffers[0]);
+	cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
 	id = starpu_worker_get_id();
 	devid = starpu_worker_get_devid(id);

+ 3 - 2
examples/basic_examples/vector_scal_opencl_kernel.cl

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,7 +17,8 @@
 __kernel void vector_mult_opencl(__global float* val, int nx, float factor)
 {
         const int i = get_global_id(0);
-        if (i < nx) {
+        if (i < nx)
+	{
                 val[i] *= factor;
         }
 }

+ 25 - 11
examples/callback/callback.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 
-starpu_data_handle handle;
+starpu_data_handle_t handle;
 
 void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
@@ -30,37 +30,47 @@ void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 	*val += 1;
 }
 
-starpu_codelet cl =
+struct starpu_codelet cl =
 {
+	.modes = { STARPU_RW },
 	.where = STARPU_CPU,
-	.cpu_func = cpu_codelet,
+	.cpu_funcs = {cpu_codelet, NULL},
 	.nbuffers = 1
 };
 
 void callback_func(void *callback_arg)
 {
+	int ret;
+
 	struct starpu_task *task = starpu_task_create();
 	task->cl = &cl;
-	task->buffers[0].handle = handle;
-	task->buffers[0].mode = STARPU_RW;
-	starpu_task_submit(task);
+	task->handles[0] = handle;
+
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
 int main(int argc, char **argv)
 {
 	int v=40;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_init(NULL);
 	starpu_variable_data_register(&handle, 0, (uintptr_t)&v, sizeof(int));
 
 	struct starpu_task *task = starpu_task_create();
 	task->cl = &cl;
 	task->callback_func = callback_func;
 	task->callback_arg = NULL;
-	task->buffers[0].handle = handle;
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = handle;
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	starpu_task_wait_for_all();
 	starpu_data_unregister(handle);
@@ -70,4 +80,8 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 
 	return 0;
+
+enodev:
+	starpu_shutdown();
+	return 77;
 }

+ 84 - 23
examples/cg/cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -72,22 +72,22 @@ static int long long n = 1024;
 static int nblocks = 8;
 static int use_reduction = 1;
 
-static starpu_data_handle A_handle, b_handle, x_handle;
+static starpu_data_handle_t A_handle, b_handle, x_handle;
 static TYPE *A, *b, *x;
 
 static int i_max = 4000;
 static double eps = (10e-14);
 
-static starpu_data_handle r_handle, d_handle, q_handle;
+static starpu_data_handle_t r_handle, d_handle, q_handle;
 static TYPE *r, *d, *q;
 
-static starpu_data_handle dtq_handle, rtr_handle;
+static starpu_data_handle_t dtq_handle, rtr_handle;
 static TYPE dtq, rtr;
 
-extern starpu_codelet accumulate_variable_cl;
-extern starpu_codelet accumulate_vector_cl;
-extern starpu_codelet bzero_variable_cl;
-extern starpu_codelet bzero_vector_cl;
+extern struct starpu_codelet accumulate_variable_cl;
+extern struct starpu_codelet accumulate_vector_cl;
+extern struct starpu_codelet bzero_variable_cl;
+extern struct starpu_codelet bzero_vector_cl;
 
 /*
  *	Generate Input data
@@ -125,6 +125,16 @@ static void generate_random_problem(void)
 	memset(q, 0, n*sizeof(TYPE));
 }
 
+static void free_data(void)
+{
+	starpu_free(A);
+	starpu_free(b);
+	starpu_free(x);
+	starpu_free(r);
+	starpu_free(d);
+	starpu_free(q);
+}
+
 static void register_data(void)
 {
 	starpu_matrix_data_register(&A_handle, 0, (uintptr_t)A, n, n, n, sizeof(TYPE));
@@ -148,6 +158,28 @@ static void register_data(void)
 	}
 }
 
+static void unregister_data(void)
+{
+	starpu_data_unpartition(A_handle, 0);
+	starpu_data_unpartition(b_handle, 0);
+	starpu_data_unpartition(x_handle, 0);
+
+	starpu_data_unpartition(r_handle, 0);
+	starpu_data_unpartition(d_handle, 0);
+	starpu_data_unpartition(q_handle, 0);
+
+	starpu_data_unregister(A_handle);
+	starpu_data_unregister(b_handle);
+	starpu_data_unregister(x_handle);
+
+	starpu_data_unregister(r_handle);
+	starpu_data_unregister(d_handle);
+	starpu_data_unregister(q_handle);
+
+	starpu_data_unregister(dtq_handle);
+	starpu_data_unregister(rtr_handle);
+}
+
 /*
  *	Data partitioning filters
  */
@@ -194,7 +226,7 @@ static void partition_data(void)
  */
 
 #if 0
-static void display_vector(starpu_data_handle handle, TYPE *ptr)
+static void display_vector(starpu_data_handle_t handle, TYPE *ptr)
 {
 	unsigned block_size = n / nblocks;
 
@@ -230,24 +262,29 @@ static void display_matrix(void)
  *	Main loop
  */
 
-static void cg(void)
+static int cg(void)
 {
 	double delta_new, delta_old, delta_0;
 	double alpha, beta;
 
 	int i = 0;
+	int ret;
 
 	/* r <- b */
-	copy_handle(r_handle, b_handle, nblocks);
+	ret = copy_handle(r_handle, b_handle, nblocks);
+	if (ret == -ENODEV) return ret;
 
 	/* r <- r - A x */
-	gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction); 
+	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction); 
+	if (ret == -ENODEV) return ret;
 
 	/* d <- r */
-	copy_handle(d_handle, r_handle, nblocks);
+	ret = copy_handle(d_handle, r_handle, nblocks);
+	if (ret == -ENODEV) return ret;
 
 	/* delta_new = dot(r,r) */
-	dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+	if (ret == -ENODEV) return ret;
 
 	starpu_data_acquire(rtr_handle, STARPU_R);
 	delta_new = rtr;
@@ -285,7 +322,8 @@ static void cg(void)
 			/* r <- r - A x */
 			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction); 
 		}
-		else {
+		else
+		{
 			/* r <- r - alpha q */
 			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
 		}
@@ -318,6 +356,7 @@ static void cg(void)
 	double timing = (double)(((double)end.tv_sec - (double)start.tv_sec)*10e6 + ((double)end.tv_usec - (double)start.tv_usec));
 	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
 	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
+	return 0;
 }
 
 static int check(void)
@@ -328,28 +367,34 @@ static int check(void)
 static void parse_args(int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-	        if (strcmp(argv[i], "-n") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+	        if (strcmp(argv[i], "-n") == 0)
+		{
 			n = (int long long)atoi(argv[++i]);
 			continue;
 		}
 
-	        if (strcmp(argv[i], "-maxiter") == 0) {
+	        if (strcmp(argv[i], "-maxiter") == 0)
+		{
 			i_max = atoi(argv[++i]);
 			continue;
 		}
 
-	        if (strcmp(argv[i], "-nblocks") == 0) {
+	        if (strcmp(argv[i], "-nblocks") == 0)
+		{
 			nblocks = atoi(argv[++i]);
 			continue;
 		}
 
-	        if (strcmp(argv[i], "-no-reduction") == 0) {
+	        if (strcmp(argv[i], "-no-reduction") == 0)
+		{
 			use_reduction = 0;
 			continue;
 		}
 
-	        if (strcmp(argv[i], "-h") == 0) {
+	        if (strcmp(argv[i], "-h") == 0)
+		{
 			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
 			exit(-1);
 			continue;
@@ -361,21 +406,37 @@ int main(int argc, char **argv)
 {
 	int ret;
 
+#ifdef STARPU_SLOW_MACHINE
+	i_max = 16;
+#endif
+
 	parse_args(argc, argv);
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
 	starpu_helper_cublas_init();
 
 	generate_random_problem();
 	register_data();
 	partition_data();
 
-	cg();
+	ret = cg();
+	if (ret == -ENODEV) goto enodev;
 
 	ret = check();
 
+	starpu_task_wait_for_all();
+	unregister_data();
+	free_data();
 	starpu_helper_cublas_shutdown();
 	starpu_shutdown();
 
 	return ret;
+
+enodev:
+	starpu_shutdown();
+	return 77;
 }
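
cg.c now propagates -ENODEV from the kernel helpers (whose prototypes change to int in cg.h below) up to main(), which exits with 77 so the test harness reports a skip rather than a failure. A minimal sketch of the pattern, with a hypothetical kernel and codelet name:

    int some_kernel(starpu_data_handle_t v, unsigned nblocks)
    {
        unsigned b;
        for (b = 0; b < nblocks; b++)
        {
            int ret = starpu_insert_task(&some_cl,   /* hypothetical codelet */
                                         STARPU_RW, starpu_data_get_sub_data(v, 1, b),
                                         0);
            if (ret == -ENODEV)
                return ret;                          /* no worker can execute it */
            STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
        }
        return 0;
    }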

+ 15 - 15
examples/cg/cg.h

@@ -57,29 +57,29 @@
 #define cublasscal	cublasSscal
 #endif
 
-void dot_kernel(starpu_data_handle v1,
-                starpu_data_handle v2,
-                starpu_data_handle s,
-		unsigned nblocks,
-		int use_reduction);
+int dot_kernel(starpu_data_handle_t v1,
+	       starpu_data_handle_t v2,
+	       starpu_data_handle_t s,
+	       unsigned nblocks,
+	       int use_reduction);
 
-void gemv_kernel(starpu_data_handle v1,
-                starpu_data_handle matrix, 
-                starpu_data_handle v2,
+int gemv_kernel(starpu_data_handle_t v1,
+                starpu_data_handle_t matrix, 
+                starpu_data_handle_t v2,
                 TYPE p1, TYPE p2,
 		unsigned nblocks,
 		int use_reduction);
 
-void axpy_kernel(starpu_data_handle v1,
-		starpu_data_handle v2, TYPE p1,
+int axpy_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t v2, TYPE p1,
 		unsigned nblocks);
 
-void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
-			starpu_data_handle v2, TYPE p2,
-			unsigned nblocks);
+int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
+		     starpu_data_handle_t v2, TYPE p2,
+		     unsigned nblocks);
 
-void copy_handle(starpu_data_handle dst,
-		starpu_data_handle src,
+int copy_handle(starpu_data_handle_t dst,
+		starpu_data_handle_t src,
 		unsigned nblocks);
 
 #endif /* __STARPU_EXAMPLE_CG_H__ */

+ 127 - 139
examples/cg/cg_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -68,16 +68,18 @@ static void accumulate_variable_cpu(void *descr[], void *cl_arg)
 	*v_dst = *v_dst + *v_src;
 }
 
-static struct starpu_perfmodel_t accumulate_variable_model = {
+static struct starpu_perfmodel accumulate_variable_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "accumulate_variable"
 };
 
-starpu_codelet accumulate_variable_cl = {
+struct starpu_codelet accumulate_variable_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = accumulate_variable_cpu,
+	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = accumulate_variable_cuda,
+	.cuda_funcs = {accumulate_variable_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &accumulate_variable_model
@@ -104,16 +106,18 @@ static void accumulate_vector_cpu(void *descr[], void *cl_arg)
 	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
 }
 
-static struct starpu_perfmodel_t accumulate_vector_model = {
+static struct starpu_perfmodel accumulate_vector_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "accumulate_vector"
 };
 
-starpu_codelet accumulate_vector_cl = {
+struct starpu_codelet accumulate_vector_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = accumulate_vector_cpu,
+	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = accumulate_vector_cuda,
+	.cuda_funcs = {accumulate_vector_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &accumulate_vector_model
@@ -142,16 +146,18 @@ static void bzero_variable_cpu(void *descr[], void *cl_arg)
 	*v = (TYPE)0.0;
 }
 
-static struct starpu_perfmodel_t bzero_variable_model = {
+static struct starpu_perfmodel bzero_variable_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "bzero_variable"
 };
 
-starpu_codelet bzero_variable_cl = {
+struct starpu_codelet bzero_variable_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = bzero_variable_cpu,
+	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = bzero_variable_cuda,
+	.cuda_funcs = {bzero_variable_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &bzero_variable_model
@@ -177,16 +183,18 @@ static void bzero_vector_cpu(void *descr[], void *cl_arg)
 	memset(v, 0, n*sizeof(TYPE));
 }
 
-static struct starpu_perfmodel_t bzero_vector_model = {
+static struct starpu_perfmodel bzero_vector_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "bzero_vector"
 };
 
-starpu_codelet bzero_vector_cl = {
+struct starpu_codelet bzero_vector_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = bzero_vector_cpu,
+	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = bzero_vector_cuda,
+	.cuda_funcs = {bzero_vector_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &bzero_vector_model
@@ -230,39 +238,47 @@ static void dot_kernel_cpu(void *descr[], void *cl_arg)
 	*dot = *dot + local_dot;
 }
 
-static struct starpu_perfmodel_t dot_kernel_model = {
+static struct starpu_perfmodel dot_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "dot_kernel"
 };
 
-static starpu_codelet dot_kernel_cl = {
+static struct starpu_codelet dot_kernel_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = dot_kernel_cpu,
+	.cpu_funcs = {dot_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = dot_kernel_cuda,
+	.cuda_funcs = {dot_kernel_cuda, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &dot_kernel_model
 };
 
-void dot_kernel(starpu_data_handle v1,
-		starpu_data_handle v2,
-		starpu_data_handle s,
-		unsigned nblocks,
-		int use_reduction)
+int dot_kernel(starpu_data_handle_t v1,
+	       starpu_data_handle_t v2,
+	       starpu_data_handle_t s,
+	       unsigned nblocks,
+	       int use_reduction)
 {
+	int ret;
+
 	/* Blank the accumulation variable */
-	starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+	ret = starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+	if (ret == -ENODEV) return ret;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&dot_kernel_cl,
-			use_reduction?STARPU_REDUX:STARPU_RW, s,
-			STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R, starpu_data_get_sub_data(v2, 1, b),
-			0);
+		ret = starpu_insert_task(&dot_kernel_cl,
+					 use_reduction?STARPU_REDUX:STARPU_RW, s,
+					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
 /*
@@ -273,7 +289,7 @@ void dot_kernel(starpu_data_handle v1,
 static void scal_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
@@ -288,7 +304,7 @@ static void scal_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE alpha;
-	starpu_unpack_cl_args(cl_arg, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &alpha);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
@@ -297,16 +313,18 @@ static void scal_kernel_cpu(void *descr[], void *cl_arg)
 	SCAL(n, alpha, v1, 1);
 }
 
-static struct starpu_perfmodel_t scal_kernel_model = {
+static struct starpu_perfmodel scal_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "scal_kernel"
 };
 
-static starpu_codelet scal_kernel_cl = {
+static struct starpu_codelet scal_kernel_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = scal_kernel_cpu,
+	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = scal_kernel_cuda,
+	.cuda_funcs = {scal_kernel_cuda, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &scal_kernel_model
@@ -328,7 +346,7 @@ static void gemv_kernel_cuda(void *descr[], void *cl_arg)
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
  
 	TYPE alpha, beta;
-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	/* Compute v1 = alpha M v2 + beta v1 */
 	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
@@ -347,7 +365,7 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
 
 	TYPE alpha, beta;
-	starpu_unpack_cl_args(cl_arg, &beta, &alpha);
+	starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
 
 	int worker_size = starpu_combined_worker_get_size();
 
@@ -368,38 +386,43 @@ static void gemv_kernel_cpu(void *descr[], void *cl_arg)
 	GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
 }
 
-static struct starpu_perfmodel_t gemv_kernel_model = {
+static struct starpu_perfmodel gemv_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "gemv_kernel"
 };
 
-static starpu_codelet gemv_kernel_cl = {
+static struct starpu_codelet gemv_kernel_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
-	.cpu_func = gemv_kernel_cpu,
+	.cpu_funcs = {gemv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = gemv_kernel_cuda,
+	.cuda_funcs = {gemv_kernel_cuda, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &gemv_kernel_model
 };
 
-void gemv_kernel(starpu_data_handle v1,
-		starpu_data_handle matrix,
-		starpu_data_handle v2,
+int gemv_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t matrix,
+		starpu_data_handle_t v2,
 		TYPE p1, TYPE p2,
 		unsigned nblocks,
 		int use_reduction)
 {
 	unsigned b1, b2;
+	int ret;
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
-		starpu_insert_task(&scal_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
-			STARPU_VALUE, &p1, sizeof(p1),
-			0);
+		ret = starpu_insert_task(&scal_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
 
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -407,15 +430,17 @@ void gemv_kernel(starpu_data_handle v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 			TYPE one = 1.0;
-			starpu_insert_task(&gemv_kernel_cl,
-				use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
-				STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
-				STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
-				STARPU_VALUE,	&one,	sizeof(one),
-				STARPU_VALUE,	&p2,	sizeof(p2),
-				0);
+			ret = starpu_insert_task(&gemv_kernel_cl,
+						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
+						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
+						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
+						 STARPU_VALUE,	&one,	sizeof(one),
+						 STARPU_VALUE,	&p2,	sizeof(p2),
+						 0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 		}
 	}
+	return 0;
 }
 
 /*
@@ -425,7 +450,7 @@ void gemv_kernel(starpu_data_handle v1,
 static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1, p2;
-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -445,7 +470,7 @@ static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE p1, p2;
-	starpu_unpack_cl_args(cl_arg, &p1, &p2);
+	starpu_codelet_unpack_args(cl_arg, &p1, &p2);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -460,35 +485,41 @@ static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
 	AXPY(nx, p2, v2, 1, v1, 1);
 }
 
-static struct starpu_perfmodel_t scal_axpy_kernel_model = {
+static struct starpu_perfmodel scal_axpy_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "scal_axpy_kernel"
 };
 
-static starpu_codelet scal_axpy_kernel_cl = {
+static struct starpu_codelet scal_axpy_kernel_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = scal_axpy_kernel_cpu,
+	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = scal_axpy_kernel_cuda,
+	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &scal_axpy_kernel_model
 };
 
-void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
-			starpu_data_handle v2, TYPE p2,
-			unsigned nblocks)
+int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
+		     starpu_data_handle_t v2, TYPE p2,
+		     unsigned nblocks)
 {
+	int ret;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&scal_axpy_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
-			STARPU_VALUE, &p1, sizeof(p1),
-			STARPU_VALUE, &p2, sizeof(p2),
-			0);
+		ret = starpu_insert_task(&scal_axpy_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 STARPU_VALUE, &p2, sizeof(p2),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
 
@@ -499,7 +530,7 @@ void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
 static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -516,7 +547,7 @@ static void axpy_kernel_cuda(void *descr[], void *cl_arg)
 static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 {
 	TYPE p1;
-	starpu_unpack_cl_args(cl_arg, &p1);
+	starpu_codelet_unpack_args(cl_arg, &p1);
 
 	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -528,89 +559,46 @@ static void axpy_kernel_cpu(void *descr[], void *cl_arg)
 	AXPY(nx, p1, v2, 1, v1, 1);
 }
 
-static struct starpu_perfmodel_t axpy_kernel_model = {
+static struct starpu_perfmodel axpy_kernel_model =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "axpy_kernel"
 };
 
-static starpu_codelet axpy_kernel_cl = {
+static struct starpu_codelet axpy_kernel_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = axpy_kernel_cpu,
+	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = axpy_kernel_cuda,
+	.cuda_funcs = {axpy_kernel_cuda, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &axpy_kernel_model
 };
 
-void axpy_kernel(starpu_data_handle v1,
-		starpu_data_handle v2, TYPE p1,
+int axpy_kernel(starpu_data_handle_t v1,
+		starpu_data_handle_t v2, TYPE p1,
 		unsigned nblocks)
 {
+	int ret;
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		starpu_insert_task(&axpy_kernel_cl,
-			STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-			STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
-			STARPU_VALUE, &p1, sizeof(p1),
-			0);
+		ret = starpu_insert_task(&axpy_kernel_cl,
+					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
+					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_VALUE, &p1, sizeof(p1),
+					 0);
+		if (ret == -ENODEV) return ret;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 	}
+	return 0;
 }
 
-
-/*
- *	COPY kernel : vector_dst <- vector_src
- */
-
-static void copy_handle_cpu(void *descr[], void *cl_arg)
-{
-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	
-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
-
-	memcpy(dst, src, nx*elemsize);
-}
-
-#ifdef STARPU_USE_CUDA
-static void copy_handle_cuda(void *descr[], void *cl_arg)
-{
-	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
-	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
-	
-	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
-	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
-
-	cudaMemcpyAsync(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-#endif
-
-static struct starpu_perfmodel_t copy_handle_model = {
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "copy_handle"
-};
-
-static starpu_codelet copy_handle_cl = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = copy_handle_cpu,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = copy_handle_cuda,
-#endif
-	.nbuffers = 2,
-	.model = &copy_handle_model
-};
-
-void copy_handle(starpu_data_handle dst, starpu_data_handle src, unsigned nblocks)
+int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
-	{
-		starpu_insert_task(&copy_handle_cl,
-			STARPU_W, starpu_data_get_sub_data(dst, 1, b),
-			STARPU_R, starpu_data_get_sub_data(src, 1, b),
-			0);
-	}
-} 
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	return 0;
+}
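
Note: the hand-written copy codelet (CPU memcpy plus CUDA cudaMemcpyAsync, its perfmodel and its codelet) is dropped here in favour of one starpu_data_cpy() call per block. A minimal sketch of that pattern in isolation is given below; the function name copy_blocks is illustrative, only the StarPU calls already visible in this diff (starpu_data_cpy, starpu_data_get_sub_data) are assumed.

/* Sketch only: copy a vector partitioned into 'nblocks' sub-vectors,
 * block by block, without a dedicated copy codelet.  The third argument
 * (1) requests an asynchronous copy; the two NULLs mean no termination
 * callback.  The copies enter the task graph like any other data access,
 * so sequential consistency orders them with later tasks. */
#include <starpu.h>

static void copy_blocks(starpu_data_handle_t dst, starpu_data_handle_t src,
			unsigned nblocks)
{
	unsigned b;
	for (b = 0; b < nblocks; b++)
		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b),
				starpu_data_get_sub_data(src, 1, b),
				1, NULL, NULL);
}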

+ 45 - 26
examples/cholesky/cholesky.h

@@ -30,6 +30,7 @@
 
 #include <common/blas.h>
 #include <starpu.h>
+#include <starpu_bound.h>
 
 #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
 #define NMAXBLOCKS	32
@@ -61,6 +62,7 @@ static unsigned nbigblocks = 8;
 static unsigned pinned = 0;
 static unsigned noprio = 0;
 static unsigned check = 0;
+static unsigned bound = 0;
 static unsigned with_ctxs = 0;
 static unsigned with_noctxs = 0;
 static unsigned chole1 = 0;
@@ -76,64 +78,81 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args);
 void chol_cublas_codelet_update_u22(void *descr[], void *_args);
 #endif
 
-extern struct starpu_perfmodel_t chol_model_11;
-extern struct starpu_perfmodel_t chol_model_21;
-extern struct starpu_perfmodel_t chol_model_22;
+extern struct starpu_perfmodel chol_model_11;
+extern struct starpu_perfmodel chol_model_21;
+extern struct starpu_perfmodel chol_model_22;
 
 static void __attribute__((unused)) parse_args(int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-with_ctxs") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-with_ctxs") == 0) 
+		{
 			with_ctxs = 1;
 			break;
 		}
-		if (strcmp(argv[i], "-with_noctxs") == 0) {
+		if (strcmp(argv[i], "-with_noctxs") == 0) 
+		{
 			with_noctxs = 1;
 			break;
 		}
 		
-		if (strcmp(argv[i], "-chole1") == 0) {
+		if (strcmp(argv[i], "-chole1") == 0) 
+		{
 			chole1 = 1;
 			break;
 		}
 
-		if (strcmp(argv[i], "-chole2") == 0) {
+		if (strcmp(argv[i], "-chole2") == 0) 
+		{
 			chole2 = 1;
 			break;
 		}
 
-		if (strcmp(argv[i], "-size") == 0) {
-			char *argptr;
+		if (strcmp(argv[i], "-size") == 0)
+		{
+		        char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 		}
-		
-		if (strcmp(argv[i], "-nblocks") == 0) {
-			char *argptr;
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+		        char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
-		
-		if (strcmp(argv[i], "-nbigblocks") == 0) {
-			char *argptr;
+
+		if (strcmp(argv[i], "-nbigblocks") == 0)
+		{
+		        char *argptr;
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
-		
-		if (strcmp(argv[i], "-pin") == 0) {
+
+		if (strcmp(argv[i], "-pin") == 0)
+		{
 			pinned = 1;
 		}
-		
-		if (strcmp(argv[i], "-no-prio") == 0) {
+
+		if (strcmp(argv[i], "-no-prio") == 0)
+		{
 			noprio = 1;
 		}
-		
-		if (strcmp(argv[i], "-check") == 0) {
+
+		if (strcmp(argv[i], "-bound") == 0)
+		{
+			bound = 1;
+		}
+
+		if (strcmp(argv[i], "-check") == 0)
+		{
 			check = 1;
 		}
-		
-		if (strcmp(argv[i], "-h") == 0) {
+
+		if (strcmp(argv[i], "-h") == 0)
+		{
 			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
-		}	
-	}	
+		}
+	}
 }
 
 #endif /* __DW_CHOLESKY_H__ */
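
The performance-model type loses its _t typedef: declarations now use struct starpu_perfmodel, and codelets list their kernels and access modes through the new cpu_funcs/modes arrays. A minimal sketch of the new declaration style follows; the names example_kernel_cpu, example_model and example_cl are placeholders, not part of this header.

/* Sketch only: a history-based model declared with the renamed
 * 'struct starpu_perfmodel', attached to a codelet using the new
 * cpu_funcs / modes array fields. */
#include <starpu.h>

static void example_kernel_cpu(void *descr[], void *cl_arg)
{
	float *v = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
	unsigned i;
	for (i = 0; i < n; i++)
		v[i] *= 2.0f;	/* dummy work, for illustration only */
	(void)cl_arg;
}

static struct starpu_perfmodel example_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "example_kernel"
};

static struct starpu_codelet example_cl =
{
	.where = STARPU_CPU,
	.cpu_funcs = {example_kernel_cpu, NULL},
	.nbuffers = 1,
	.modes = {STARPU_RW},
	.model = &example_model
};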

+ 100 - 60
examples/cholesky/cholesky_grain_tag.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,91 +36,100 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
+	.modes = { STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &chol_model_11
 };
 
-static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
+static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k, unsigned reclevel)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
 	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
-	
+
 	task->cl = &cl11;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 
 	/* this is an important task */
 	task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
 	}
 
 	return task;
 }
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &chol_model_21
 };
 
-static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, unsigned reclevel)
+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
 {
+	int ret;
+
 	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
 
-	task->cl = &cl21;	
+	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
 
-	if (j == k+1) {
+	if (j == k+1)
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
 	}
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &chol_model_22
 };
 
-static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
 {
+	int ret;
+
 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
@@ -128,44 +137,47 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
-	task->buffers[2].mode = STARPU_RW;
-
-	if ( (i == k + 1) && (j == k +1) ) {
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+
+	if ( (i == k + 1) && (j == k +1) )
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
 	}
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
 
 
 /*
- *	code to bootstrap the factorization 
+ *	code to bootstrap the factorization
  *	and construct the DAG
  */
 
 static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
 {
+	int ret;
+
 	/* create a new codelet */
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-	starpu_data_handle dataA;
+	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
@@ -173,12 +185,14 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
-	struct starpu_data_filter f = {
+	struct starpu_data_filter f =
+	{
 		.filter_func = starpu_vertical_block_filter_func,
 		.nchildren = nblocks
 	};
 
-	struct starpu_data_filter f2 = {
+	struct starpu_data_filter f2 =
+	{
 		.filter_func = starpu_block_filter_func,
 		.nchildren = nblocks
 	};
@@ -189,13 +203,16 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	{
 		struct starpu_task *task = create_task_11(dataA, k, reclevel);
 		/* we defer the launch of the first task */
-		if (k == 0) {
+		if (k == 0)
+		{
 			entry_task = task;
 		}
-		else {
-			starpu_task_submit(task);
+		else
+		{
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
-		
+
 		for (j = k+1; j<nblocks; j++)
 		{
 			create_task_21(dataA, k, j, reclevel);
@@ -209,7 +226,7 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	}
 
 	/* schedule the codelet */
-	int ret = starpu_task_submit(entry_task);
+	ret = starpu_task_submit(entry_task);
 	if (STARPU_UNLIKELY(ret == -ENODEV))
 	{
 		FPRINTF(stderr, "No worker may execute this task\n");
@@ -223,7 +240,8 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 		starpu_data_unpartition(dataA, 0);
 		return;
 	}
-	else {
+	else
+	{
 		STARPU_ASSERT(reclevel == 0);
 		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);
 
@@ -253,20 +271,26 @@ static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 static void initialize_system(float **A, unsigned dim, unsigned pinned)
 {
-	starpu_init(NULL);
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	starpu_helper_cublas_init();
 
 	if (pinned)
 	{
 		starpu_malloc((void **)A, dim*dim*sizeof(float));
-	} 
-	else {
+	}
+	else
+	{
 		*A = malloc(dim*dim*sizeof(float));
 	}
 }
 
-void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
+void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned pinned)
 {
 	struct timeval start;
 	struct timeval end;
@@ -284,6 +308,15 @@ void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 	double flop = (1.0f*size*size*size)/3.0f;
 	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 
+	if (pinned)
+	{
+		starpu_free(matA);
+	}
+	else
+	{
+		free(matA);
+	}
+
 	starpu_helper_cublas_shutdown();
 
 	starpu_shutdown();
@@ -321,10 +354,12 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
@@ -333,7 +368,7 @@ int main(int argc, char **argv)
 #endif
 
 
-	cholesky_grain(mat, size, size, nblocks, nbigblocks);
+	cholesky_grain(mat, size, size, nblocks, nbigblocks, pinned);
 
 #ifdef CHECK_OUTPUT
 	FPRINTF(stdout, "Results :\n");
@@ -342,10 +377,12 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 				mat[j+i*size] = 0.0f; /* debug */
 			}
@@ -357,7 +394,7 @@ int main(int argc, char **argv)
 	float *test_mat = malloc(size*size*sizeof(float));
 	STARPU_ASSERT(test_mat);
 
-	SSYRK("L", "N", size, size, 1.0f, 
+	SSYRK("L", "N", size, size, 1.0f,
 				mat, size, 0.0f, test_mat, size);
 
 	FPRINTF(stderr, "comparing results ...\n");
@@ -365,15 +402,18 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
                                 FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
+	free(test_mat);
 #endif
 
 	return 0;
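
Task construction in this file moves from per-buffer handle/mode pairs to task->handles[] plus access modes declared once in the codelet, and every starpu_task_submit() is now checked. A condensed sketch of that pattern is given below; submit_update is a made-up helper, and the codelet passed in is assumed to declare a single STARPU_RW buffer, as cl11 above does.

/* Sketch only: create and submit a tagged task the new way.  The access
 * mode lives in the codelet (.modes); the task only lists the handles. */
#include <starpu.h>

static int submit_update(struct starpu_codelet *cl,
			 starpu_data_handle_t block, starpu_tag_t tag)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = cl;			/* e.g. &cl11, whose .modes declares STARPU_RW */
	task->use_tag = 1;
	task->tag_id = tag;
	task->handles[0] = block;	/* replaces task->buffers[0].handle / .mode */
	task->priority = STARPU_MAX_PRIO;

	int ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
	return ret;
}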

+ 79 - 46
examples/cholesky/cholesky_implicit.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,45 +17,48 @@
  */
 
 #include "cholesky.h"
-#include "../sched_ctx_utils/sched_ctx_utils.h"
+
 /*
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.model = &chol_model_11
 };
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
 	.model = &chol_model_21
 };
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
 	.max_parallelism = INT_MAX,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &chol_model_22
 };
 
@@ -69,8 +72,9 @@ static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
 	cl22.type = STARPU_SPMD;
 }
 
-static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
+static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
+	int ret;
 	struct timeval start;
 	struct timeval end;
 
@@ -80,46 +84,53 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
 	gettimeofday(&start, NULL);
 
+	if (bound)
+		starpu_bound_start(0, 0);
 	/* create all the DAG nodes */
 	for (k = 0; k < nblocks; k++)
 	{
-                starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
+                starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
-                starpu_insert_task(&cl11,
-                                   STARPU_PRIORITY, prio_level,
-                                   STARPU_RW, sdatakk,
-				   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
-                                   0);
+                ret = starpu_insert_task(&cl11,
+					 STARPU_PRIORITY, prio_level,
+					 STARPU_RW, sdatakk,
+					 STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
+					 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 		for (j = k+1; j<nblocks; j++)
 		{
-                        starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
+                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
 
-                        starpu_insert_task(&cl21,
-                                           STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
-                                           STARPU_R, sdatakk,
-                                           STARPU_RW, sdatakj,
-                                           0);
+                        ret = starpu_insert_task(&cl21,
+						 STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
+						 STARPU_R, sdatakk,
+						 STARPU_RW, sdatakj,
+						 0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
 
 			for (i = k+1; i<nblocks; i++)
 			{
 				if (i <= j)
                                 {
-					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
-					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
-					
-					starpu_insert_task(&cl22,
-                                                           STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
-                                                           STARPU_R, sdataki,
-                                                           STARPU_R, sdatakj,
-                                                           STARPU_RW, sdataij,
-                                                           0);
+					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
+					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
+
+					ret = starpu_insert_task(&cl22,
+								 STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
+								 STARPU_R, sdataki,
+								 STARPU_R, sdatakj,
+								 STARPU_RW, sdataij,
+								 0);
+					STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
                                 }
 			}
 		}
 	}
 
 	starpu_task_wait_for_all();
+	if (bound)
+		starpu_bound_stop();
 
 	starpu_data_unpartition(dataA, 0);
 
@@ -139,23 +150,31 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 		FPRINTF(stdout, "%2.2f\n", timing/1000);
 	
 		FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+		if (bound)
+		{
+			double res;
+			starpu_bound_compute(&res, NULL, 0);
+			FPRINTF(stderr, "Theoretical GFlops: %2.2f\n", (flop/res/1000000.0f));
+		}
 	}
 }
 
 static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
-	starpu_data_handle dataA;
+	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
 	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
 
-	struct starpu_data_filter f = {
+	struct starpu_data_filter f =
+	{
 		.filter_func = starpu_vertical_block_filter_func,
 		.nchildren = nblocks
 	};
 
-	struct starpu_data_filter f2 = {
+	struct starpu_data_filter f2 =
+	{
 		.filter_func = starpu_block_filter_func,
 		.nchildren = nblocks
 	};
@@ -163,6 +182,8 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
 	_cholesky(dataA, nblocks);
+
+	starpu_data_unregister(dataA);
 }
 
 static void execute_cholesky(unsigned size, unsigned nblocks)
@@ -188,16 +209,19 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
 #endif
+
 	cholesky(mat, size, size, nblocks);
 
 #ifdef PRINT_OUTPUT
@@ -206,10 +230,12 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 				mat[j+i*size] = 0.0f; /* debug */
 			}
@@ -225,7 +251,8 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		{
 			for (i = 0; i < size; i++)
 			{
-				if (i > j) {
+				if (i > j)
+				{
 					mat[j+i*size] = 0.0f; /* debug */
 				}
 			}
@@ -242,10 +269,12 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		{
 			for (i = 0; i < size; i++)
 			{
-				if (i <= j) {
+				if (i <= j)
+				{
 					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 				}
-				else {
+				else
+				{
 					FPRINTF(stdout, ".\t");
 				}
 			}
@@ -257,17 +286,21 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		{
 			for (i = 0; i < size; i++)
 			{
-				if (i <= j) {
+				if (i <= j)
+				{
 	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
 	                                float err = abs(test_mat[j +i*size] - orig);
-	                                if (err > 0.00001) {
+	                                if (err > 0.00001)
+					{
 	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
 	                                        assert(0);
 	                                }
 	                        }
 			}
 	        }
+		free(test_mat);
 	}
+	starpu_free(mat);
 
 }
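
The new -bound option brackets the task graph with starpu_bound_start()/starpu_bound_stop() and then queries the linear-programming lower bound on execution time. A minimal sketch of that measurement pattern follows; bounded_run and the commented submit step stand in for the factorization loops above, and the GFlop/s conversion simply mirrors the formula used in this file.

/* Sketch only: record the submitted tasks for the theoretical bound,
 * wait for them, then compare the bound against the achieved time. */
#include <starpu.h>
#include <starpu_bound.h>
#include <stdio.h>

static void bounded_run(double flop)
{
	starpu_bound_start(0, 0);	/* do not account for dependencies or priorities */

	/* ... submit the factorization tasks here ... */

	starpu_task_wait_for_all();
	starpu_bound_stop();

	double res;
	starpu_bound_compute(&res, NULL, 0);
	fprintf(stderr, "Theoretical GFlops: %2.2f\n", flop/res/1000000.0f);
}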
 

+ 9 - 5
examples/cholesky/cholesky_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -55,7 +55,8 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
 				right, ld12, 1.0f, center, ld22);
 		}
-		else {
+		else
+		{
 			/* Parallel CPU kernel */
 			int rank = starpu_combined_worker_get_rank();
 
@@ -113,7 +114,8 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
@@ -157,7 +159,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 
 	unsigned z;
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 
 			/*
@@ -188,7 +191,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 			int ret;
 			int info;
 			ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
-			if (ret != MAGMA_SUCCESS) {
+			if (ret != MAGMA_SUCCESS)
+			{
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}

+ 34 - 27
examples/cholesky/cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,8 +17,8 @@
  */
 
 /*
- * As a convention, in that file, descr[0] is represented by A,
- * 				  descr[1] is B ...
+ * As a convention, in that file, buffers[0] is represented by A,
+ * 				  buffers[1] is B ...
  */
 
 /*
@@ -26,6 +26,7 @@
  */
 
 #include <starpu.h>
+#include "cholesky.h"
 
 /* #define USE_PERTURBATION	1 */
 
@@ -35,11 +36,11 @@
 #define PERTURBATE(a)	(a)
 #endif
 
-static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
+static double cpu_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
 
@@ -50,11 +51,11 @@ static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
+static double cuda_chol_task_11_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
 
@@ -65,11 +66,11 @@ static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
+static double cpu_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
 
@@ -80,11 +81,11 @@ static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
+static double cuda_chol_task_21_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
 
@@ -95,11 +96,11 @@ static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
+static double cpu_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
 
@@ -110,11 +111,11 @@ static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
+static double cuda_chol_task_22_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
 {
 	uint32_t n;
 
-	n = starpu_matrix_get_nx(descr[0].handle);
+	n = starpu_matrix_get_nx(task->handles[0]);
 
 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
 
@@ -125,28 +126,34 @@ static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
 	return PERTURBATE(cost);
 }
 
-struct starpu_perfmodel_t chol_model_11 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
+struct starpu_perfmodel chol_model_11 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_11_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_11"
 };
 
-struct starpu_perfmodel_t chol_model_21 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
+struct starpu_perfmodel chol_model_21 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_21_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_21"
 };
 
-struct starpu_perfmodel_t chol_model_22 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
+struct starpu_perfmodel chol_model_22 =
+{
+	.per_arch =
+	{
+		[STARPU_CPU_DEFAULT][0] = { .cost_function = cpu_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_chol_task_22_cost }
 	},
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_22"
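
Per-architecture cost models change signature here: the old cost_model(starpu_buffer_descr *) callbacks become cost_function(struct starpu_task *, enum starpu_perf_archtype, unsigned nimpl) and read problem sizes from task->handles. A compact sketch with an invented cubic cost (not the calibrated constants used above):

/* Sketch only: one explicit cost function per architecture, plugged into
 * the per_arch table of a history-based model. */
#include <starpu.h>
#include <stdint.h>

static double cpu_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
{
	uint32_t n = starpu_matrix_get_nx(task->handles[0]);
	(void)arch; (void)nimpl;
	return ((double)n * n * n) / 1000.0;	/* invented constant, illustration only */
}

static double cuda_cost(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl)
{
	uint32_t n = starpu_matrix_get_nx(task->handles[0]);
	(void)arch; (void)nimpl;
	return ((double)n * n * n) / 50000.0;	/* invented constant, illustration only */
}

static struct starpu_perfmodel example_chol_model =
{
	.per_arch =
	{
		[STARPU_CPU_DEFAULT][0]  = { .cost_function = cpu_cost },
		[STARPU_CUDA_DEFAULT][0] = { .cost_function = cuda_cost }
	},
	.type = STARPU_HISTORY_BASED,
	.symbol = "example_chol_model"
};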

+ 96 - 57
examples/cholesky/cholesky_tag.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -36,18 +36,19 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
+	.modes = { STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 	.nbuffers = 1,
 	.model = &chol_model_11
 };
 
-static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
+static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned k)
 {
 /*	FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
 
@@ -56,76 +57,80 @@ static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
 	task->cl = &cl11;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 
 	/* this is an important task */
 	if (!noprio)
 		task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
 
 	return task;
 }
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 	.nbuffers = 2,
 	.model = &chol_model_21
 };
 
-static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
+static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 {
 	struct starpu_task *task = create_task(TAG21(k, j));
 
 	task->cl = &cl21;	
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
 
-	if (!noprio && (j == k+1)) {
+	if (!noprio && (j == k+1))
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 	}
 
 	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
                 FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
 
 }
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 	.nbuffers = 3,
 	.model = &chol_model_22
 };
 
-static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
+static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
 {
 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
@@ -134,27 +139,28 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
-	task->buffers[2].mode = STARPU_RW;
-
-	if (!noprio && (i == k + 1) && (j == k +1) ) {
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+
+	if (!noprio && (i == k + 1) && (j == k +1) )
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 	}
 
 	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
                 FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
@@ -167,7 +173,7 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
  *	and construct the DAG
  */
 
-static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
+static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
 	struct timeval start;
 	struct timeval end;
@@ -183,12 +189,15 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	{
 		struct starpu_task *task = create_task_11(dataA, k);
 		/* we defer the launch of the first task */
-		if (k == 0) {
+		if (k == 0)
+		{
 			entry_task = task;
 		}
-		else {
+		else
+		{
 			int ret = starpu_task_submit(task);
-                        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+                        if (STARPU_UNLIKELY(ret == -ENODEV))
+			{
                                 FPRINTF(stderr, "No worker may execute this task\n");
                                 exit(0);
                         }
@@ -209,7 +218,8 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 
 	/* schedule the codelet */
 	int ret = starpu_task_submit(entry_task);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
+        if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
                 FPRINTF(stderr, "No worker may execute this task\n");
                 exit(0);
         }
@@ -233,24 +243,31 @@ static void _cholesky(starpu_data_handle dataA, unsigned nblocks)
 	FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
-static void initialize_system(float **A, unsigned dim, unsigned pinned)
+static int initialize_system(float **A, unsigned dim, unsigned pinned)
 {
-	starpu_init(NULL);
-	
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
 	starpu_helper_cublas_init();
 
 	if (pinned)
 	{
 		starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
 	} 
-	else {
+	else
+	{
 		*A = malloc(dim*dim*sizeof(float));
 	}
+	return 0;
 }
 
-static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned pinned)
 {
-	starpu_data_handle dataA;
+	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :
 	 * one block is now determined by 2 unsigned (i,j) */
@@ -258,12 +275,14 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
-	struct starpu_data_filter f = {
+	struct starpu_data_filter f =
+	{
 		.filter_func = starpu_vertical_block_filter_func,
 		.nchildren = nblocks
 	};
 
-	struct starpu_data_filter f2 = {
+	struct starpu_data_filter f2 =
+	{
 		.filter_func = starpu_block_filter_func,
 		.nchildren = nblocks
 	};
@@ -272,6 +291,17 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	_cholesky(dataA, nblocks);
 
+	starpu_data_unregister(dataA);
+
+	if (pinned)
+	{
+		starpu_free(matA);
+	}
+	else
+	{
+		free(matA);
+	}
+
 	starpu_helper_cublas_shutdown();
 
 	starpu_shutdown();
@@ -289,7 +319,9 @@ int main(int argc, char **argv)
 	float *mat;
 
 	mat = malloc(size*size*sizeof(float));
-	initialize_system(&mat, size, pinned);
+	int ret = initialize_system(&mat, size, pinned);
+	if (ret)
+		return ret;
 
 	unsigned i,j;
 	for (i = 0; i < size; i++)
@@ -309,10 +341,12 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
@@ -321,7 +355,7 @@ int main(int argc, char **argv)
 #endif
 
 
-	cholesky(mat, size, size, nblocks);
+	cholesky(mat, size, size, nblocks, pinned);
 
 #ifdef CHECK_OUTPUT
 	FPRINTF(stdout, "Results :\n");
@@ -330,10 +364,12 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 				mat[j+i*size] = 0.0f; /* debug */
 			}
@@ -353,14 +389,17 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i <= j) {
+			if (i <= j)
+			{
 				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
 			}
-			else {
+			else
+			{
 				FPRINTF(stdout, ".\t");
 			}
 		}
 		FPRINTF(stdout, "\n");
+		free(test_mat);
 	}
 #endif
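
starpu_init() is no longer assumed to succeed: a -ENODEV return makes the program exit with code 77, the autotools convention for a skipped test, and any other error goes through STARPU_CHECK_RETURN_VALUE. A minimal sketch of that entry-point pattern:

/* Sketch only: initialise StarPU, report 77 ("test skipped") when no
 * worker is available, and shut down cleanly. */
#include <starpu.h>
#include <errno.h>

int main(void)
{
	int ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;	/* no CPU/GPU worker can run the example */
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* ... submit tasks and wait for them here ... */

	starpu_shutdown();
	return 0;
}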
 

+ 75 - 54
examples/cholesky/cholesky_tile_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,7 +19,7 @@
 
 /* A [ y ] [ x ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
-starpu_data_handle A_state[NMAXBLOCKS][NMAXBLOCKS];
+starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
 
 /*
  *	Some useful functions
@@ -39,12 +39,13 @@ static struct starpu_task *create_task(starpu_tag_t id)
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
+	.modes = { STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 #ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_POTRF
@@ -66,26 +67,27 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 	task->cl = &cl11;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = A_state[k][k];
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = A_state[k][k];
 
 	/* this is an important task */
 	task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
 	}
 
 	return task;
 }
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
+	.modes = { STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 #ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_STRSM
@@ -100,37 +102,42 @@ static starpu_codelet cl21 =
 
 static void create_task_21(unsigned k, unsigned j)
 {
+	int ret;
+
 	struct starpu_task *task = create_task(TAG21(k, j));
 
 	task->cl = &cl21;	
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = A_state[k][k]; 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = A_state[j][k]; 
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = A_state[k][k];
+	task->handles[1] = A_state[j][k];
 
-	if (j == k+1) {
+	if (j == k+1)
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
 	}
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
+	.modes = { STARPU_R, STARPU_R, STARPU_RW },
 	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 #ifdef STARPU_USE_GORDON
 #ifdef SPU_FUNC_SGEMM
@@ -145,6 +152,8 @@ static starpu_codelet cl22 =
 
 static void create_task_22(unsigned k, unsigned i, unsigned j)
 {
+	int ret;
+
 /*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
 
 	struct starpu_task *task = create_task(TAG22(k, i, j));
@@ -152,26 +161,27 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = A_state[i][k]; 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = A_state[j][k]; 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = A_state[j][i]; 
-	task->buffers[2].mode = STARPU_RW;
-
-	if ( (i == k + 1) && (j == k +1) ) {
+	task->handles[0] = A_state[i][k];
+	task->handles[1] = A_state[j][k];
+	task->handles[2] = A_state[j][i];
+
+	if ( (i == k + 1) && (j == k +1) )
+	{
 		task->priority = STARPU_MAX_PRIO;
 	}
 
 	/* enforce dependencies ... */
-	if (k > 0) {
+	if (k > 0)
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
 	}
-	else {
+	else
+	{
 		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
 	}
 
-	starpu_task_submit(task);
+	ret = starpu_task_submit(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 }
 
 
@@ -183,6 +193,8 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 
 static void cholesky_no_stride(void)
 {
+	int ret;
+
 	struct timeval start;
 	struct timeval end;
 
@@ -195,11 +207,14 @@ static void cholesky_no_stride(void)
 	{
 		struct starpu_task *task = create_task_11(k, nblocks);
 		/* we defer the launch of the first task */
-		if (k == 0) {
+		if (k == 0)
+		{
 			entry_task = task;
 		}
-		else {
-			starpu_task_submit(task);
+		else
+		{
+			ret = starpu_task_submit(task);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 		
 		for (j = k+1; j<nblocks; j++)
@@ -216,7 +231,8 @@ static void cholesky_no_stride(void)
 
 	/* schedule the codelet */
 	gettimeofday(&start, NULL);
-	starpu_task_submit(entry_task);
+	ret = starpu_task_submit(entry_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
@@ -235,13 +251,17 @@ int main(int argc, char **argv)
 {
 	unsigned x, y;
 	unsigned i, j;
+	int ret;
 
 	parse_args(argc, argv);
 	assert(nblocks <= NMAXBLOCKS);
 
 	FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
 
-	starpu_init(NULL);
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	/* Disable sequential consistency */
 	starpu_data_set_default_sequential_consistency_flag(0);
@@ -251,17 +271,8 @@ int main(int argc, char **argv)
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	{
-		if (x <= y) {
-			A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
-			assert(A[y][x]);
-		}
-	}
-
-
-	for (y = 0; y < nblocks; y++)
-	for (x = 0; x < nblocks; x++)
-	{
-		if (x <= y) {
+		if (x <= y)
+		{
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 			posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
 #else
@@ -277,7 +288,8 @@ int main(int argc, char **argv)
 	 * */
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
-	if (x <= y) {
+	if (x <= y)
+	{
 		for (i = 0; i < BLOCKSIZE; i++)
 		for (j = 0; j < BLOCKSIZE; j++)
 		{
@@ -290,12 +302,11 @@ int main(int argc, char **argv)
 		}
 	}
 
-
-
 	for (y = 0; y < nblocks; y++)
 	for (x = 0; x < nblocks; x++)
 	{
-		if (x <= y) {
+		if (x <= y)
+		{
 			starpu_matrix_data_register(&A_state[y][x], 0, (uintptr_t)A[y][x], 
 				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
 		}
@@ -303,6 +314,16 @@ int main(int argc, char **argv)
 
 	cholesky_no_stride();
 
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y)
+		{
+			starpu_data_unregister(A_state[y][x]);
+			free(A[y][x]);
+		}
+	}
+
 	starpu_helper_cublas_shutdown();
 
 	starpu_shutdown();
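
The tiled version now balances every starpu_matrix_data_register() with a starpu_data_unregister() and frees the per-block buffers once the factorization is done. A small sketch of that lifecycle for a single block follows; one_block_lifecycle is a made-up helper and blocksize stands in for BLOCKSIZE above.

/* Sketch only: register one square block, let StarPU work on it, then
 * unregister it (so StarPU stops using the buffer) before freeing it. */
#include <starpu.h>
#include <stdint.h>
#include <stdlib.h>

static void one_block_lifecycle(unsigned blocksize)
{
	float *block = malloc(blocksize * blocksize * sizeof(float));
	starpu_data_handle_t handle;

	starpu_matrix_data_register(&handle, 0, (uintptr_t)block,
				    blocksize, blocksize, blocksize, sizeof(float));

	/* ... tasks accessing 'handle' are submitted and complete here ... */

	starpu_data_unregister(handle);
	free(block);
}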

+ 0 - 0
examples/cholesky_2ctxs/cholesky/.dirstamp


+ 0 - 154
examples/cholesky_2ctxs/cholesky/cholesky.h

@@ -1,154 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __DW_CHOLESKY_H__
-#define __DW_CHOLESKY_H__
-
-#include <limits.h>
-#include <string.h>
-#include <math.h>
-#include <sys/time.h>
-#ifdef STARPU_USE_CUDA
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cublas.h>
-#endif
-
-#include <common/blas.h>
-#include <starpu.h>
-
-#define NMAXBLOCKS	32
-
-#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
-#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
-					| (unsigned long long)(j))))
-#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
-					| ((unsigned long long)(i)<<16)	\
-					| (unsigned long long)(j))))
-
-#define TAG11_AUX(k, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  |  (1ULL<<56) | (unsigned long long)(k)))
-#define TAG21_AUX(k,j, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  			\
-					|  ((3ULL<<56) | (((unsigned long long)(k))<<32)	\
-					| (unsigned long long)(j))))
-#define TAG22_AUX(k,i,j, prefix)    ((starpu_tag_t)(  (((unsigned long long)(prefix))<<60)	\
-					|  ((4ULL<<56) | ((unsigned long long)(k)<<32)  	\
-					| ((unsigned long long)(i)<<16) 			\
-					| (unsigned long long)(j))))
-
-#define BLOCKSIZE	(size/nblocks)
-
-#define BLAS3_FLOP(n1,n2,n3)    \
-        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
-
-//static unsigned size = 4*1024;
-//static unsigned nblocks = 16;
-static unsigned nbigblocks = 8;
-static unsigned pinned = 0;
-static unsigned noprio = 0;
-static unsigned check = 0;
-
-void chol_cpu_codelet_update_u11(void **, void *);
-void chol_cpu_codelet_update_u21(void **, void *);
-void chol_cpu_codelet_update_u22(void **, void *);
-
-#ifdef STARPU_USE_CUDA
-void chol_cublas_codelet_update_u11(void *descr[], void *_args);
-void chol_cublas_codelet_update_u21(void *descr[], void *_args);
-void chol_cublas_codelet_update_u22(void *descr[], void *_args);
-#endif
-
-double run_cholesky_implicit(int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier);
-
-extern struct starpu_perfmodel_t chol_model_11;
-extern struct starpu_perfmodel_t chol_model_21;
-extern struct starpu_perfmodel_t chol_model_22;
-
-static void __attribute__((unused)) parse_args(int argc, char **argv, unsigned *size, unsigned *nblocks)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
-		        char *argptr;
-			(*size) = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocks") == 0) {
-		        char *argptr;
-			(*nblocks) = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nbigblocks") == 0) {
-		        char *argptr;
-			nbigblocks = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-pin") == 0) {
-			pinned = 1;
-		}
-
-		if (strcmp(argv[i], "-no-prio") == 0) {
-			noprio = 1;
-		}
-
-		if (strcmp(argv[i], "-check") == 0) {
-			check = 1;
-		}
-
-		if (strcmp(argv[i], "-h") == 0) {
-			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
-		}
-	}
-}
-
-static void __attribute__((unused)) parse_args_ctx(int start, int argc, char **argv, unsigned *size, unsigned *nblocks)
-{
-	int i;
-	for (i = start; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
-		        char *argptr;
-			(*size) = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocks") == 0) {
-		        char *argptr;
-			(*nblocks) = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nbigblocks") == 0) {
-		        char *argptr;
-			nbigblocks = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-pin") == 0) {
-			pinned = 1;
-		}
-
-		if (strcmp(argv[i], "-no-prio") == 0) {
-			noprio = 1;
-		}
-
-		if (strcmp(argv[i], "-check") == 0) {
-			check = 1;
-		}
-
-		if (strcmp(argv[i], "-h") == 0) {
-			printf("usage : %s [-pin] [-size size] [-nblocks nblocks] [-check]\n", argv[0]);
-		}
-	}
-}
-
-#endif // __DW_CHOLESKY_H__
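Note on the tag encoding above: the removed header identifies every task by a single 64-bit starpu_tag_t in which the top four bits select the kernel (1 for the 11 update, 3 for 21, 4 for 22) and the block indices k, i, j sit in lower bit fields; the _AUX variants keep the top four bits free for a recursion-level prefix and move the kernel id to bits 56-59. A standalone sketch of that packing (illustration only, not part of the commit: uint64_t stands in for starpu_tag_t and decode_tag22 is a made-up helper):

#include <stdint.h>
#include <stdio.h>

/* Same layout as the removed TAG22(k,i,j): kernel id 4 in the top four bits,
 * k starting at bit 32, i at bit 16, j in the low 16 bits. */
#define TAG22(k,i,j)	((uint64_t)((4ULL<<60) | ((uint64_t)(k)<<32) \
				| ((uint64_t)(i)<<16) | (uint64_t)(j)))

/* Made-up helper that undoes the packing (assumes k, i, j < 2^16). */
static void decode_tag22(uint64_t tag, unsigned *k, unsigned *i, unsigned *j)
{
	*k = (tag >> 32) & 0xffff;
	*i = (tag >> 16) & 0xffff;
	*j = tag & 0xffff;
}

int main(void)
{
	unsigned k, i, j;
	decode_tag22(TAG22(2, 5, 7), &k, &i, &j);
	printf("k=%u i=%u j=%u\n", k, i, j);	/* prints k=2 i=5 j=7 */
	return 0;
}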

+ 0 - 382
examples/cholesky_2ctxs/cholesky/cholesky_grain_tag.c

@@ -1,382 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "cholesky.h"
-
-/*
- *	Some useful functions
- */
-
-static struct starpu_task *create_task(starpu_tag_t id)
-{
-	struct starpu_task *task = starpu_task_create();
-		task->cl_arg = NULL;
-		task->use_tag = 1;
-		task->tag_id = id;
-
-	return task;
-}
-
-/*
- *	Create the codelets
- */
-
-static starpu_codelet cl11 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u11,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
-static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
-{
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
-
-	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
-	
-	task->cl = &cl11;
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->buffers[0].mode = STARPU_RW;
-
-	/* this is an important task */
-	task->priority = STARPU_MAX_PRIO;
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
-	}
-
-	return task;
-}
-
-static starpu_codelet cl21 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u21,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
-static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, unsigned reclevel, struct starpu_sched_ctx *sched_ctx)
-{
-	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
-
-	task->cl = &cl21;	
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_RW;
-
-	if (j == k+1) {
-		task->priority = STARPU_MAX_PRIO;
-	}
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
-	}
-	else {
-		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
-	}
-
-	starpu_task_submit_to_ctx(task, sched_ctx);
-}
-
-static starpu_codelet cl22 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u22,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
-static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel, struct starpu_sched_ctx *sched_ctx)
-{
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j));
-
-	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
-
-	task->cl = &cl22;
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
-	task->buffers[2].mode = STARPU_RW;
-
-	if ( (i == k + 1) && (j == k +1) ) {
-		task->priority = STARPU_MAX_PRIO;
-	}
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
-	}
-	else {
-		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
-	}
-
-	starpu_task_submit_to_ctx(task, sched_ctx);
-}
-
-
-
-/*
- *	code to bootstrap the factorization 
- *	and construct the DAG
- */
-
-static void cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel, struct starpu_sched_ctx *sched_ctx)
-{
-	/* create a new codelet */
-	struct starpu_task *entry_task = NULL;
-
-	/* create all the DAG nodes */
-	unsigned i,j,k;
-
-	starpu_data_handle dataA;
-
-	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
-
-	starpu_data_set_sequential_consistency_flag(dataA, 0);
-
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
-
-	starpu_data_map_filters(dataA, 2, &f, &f2);
-
-	for (k = 0; k < nbigblocks; k++)
-	{
-		struct starpu_task *task = create_task_11(dataA, k, reclevel);
-		/* we defer the launch of the first task */
-		if (k == 0) {
-			entry_task = task;
-		}
-		else {
-		  starpu_task_submit_to_ctx(task, sched_ctx);
-		}
-		
-		for (j = k+1; j<nblocks; j++)
-		{
-		  create_task_21(dataA, k, j, reclevel, sched_ctx);
-
-			for (i = k+1; i<nblocks; i++)
-			{
-				if (i <= j)
-				  create_task_22(dataA, k, i, j, reclevel, sched_ctx);
-			}
-		}
-	}
-
-	/* schedule the codelet */
-	int ret = starpu_task_submit_to_ctx(entry_task, sched_ctx);
-	if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-		fprintf(stderr, "No worker may execute this task\n");
-		exit(-1);
-	}
-
-	if (nblocks == nbigblocks)
-	{
-		/* stall the application until the end of computations */
-		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
-		starpu_data_unpartition(dataA, 0);
-		return;
-	}
-	else {
-		STARPU_ASSERT(reclevel == 0);
-		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);
-
-		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
-		STARPU_ASSERT(tag_array);
-
-		unsigned ind = 0;
-		for (i = nbigblocks; i < nblocks; i++)
-		for (j = nbigblocks; j < nblocks; j++)
-		{
-			if (i <= j)
-				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
-		}
-
-		starpu_tag_wait_array(ind, tag_array);
-
-		free(tag_array);
-
-		starpu_data_unpartition(dataA, 0);
-		starpu_data_unregister(dataA);
-
-		float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)];
-
-		cholesky_grain_rec(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1, sched_ctx);
-	}
-}
-
-static void initialize_system(float **A, unsigned dim, unsigned pinned)
-{
-  //	starpu_init(NULL);
-
-	starpu_helper_cublas_init();
-
-	if (pinned)
-	{
-		starpu_data_malloc_pinned_if_possible((void **)A, dim*dim*sizeof(float));
-	} 
-	else {
-		*A = malloc(dim*dim*sizeof(float));
-	}
-}
-
-void cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, struct starpu_sched_ctx *sched_ctx)
-{
-	struct timeval start;
-	struct timeval end;
-
-	gettimeofday(&start, NULL);
-
-	cholesky_grain_rec(matA, size, ld, nblocks, nbigblocks, 0, sched_ctx);
-
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
-
-	double flop = (1.0f*size*size*size)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
-
-	starpu_helper_cublas_shutdown();
-
-	//	starpu_shutdown();
-}
-
-int run_cholesky_grain_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
-{
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
-	parse_args(argc, argv);
-
-	float *mat;
-
-	mat = malloc(size*size*sizeof(float));
-	initialize_system(&mat, size, pinned);
-
-	unsigned i,j;
-	for (i = 0; i < size; i++)
-	{
-		for (j = 0; j < size; j++)
-		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
-		}
-	}
-
-
-#ifdef CHECK_OUTPUT
-	printf("Input :\n");
-
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-			}
-		}
-		printf("\n");
-	}
-#endif
-
-
-	cholesky_grain(mat, size, size, nblocks, nbigblocks, sched_ctx);
-
-#ifdef CHECK_OUTPUT
-	printf("Results :\n");
-
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
-			}
-		}
-		printf("\n");
-	}
-
-	fprintf(stderr, "compute explicit LLt ...\n");
-	float *test_mat = malloc(size*size*sizeof(float));
-	STARPU_ASSERT(test_mat);
-
-	SSYRK("L", "N", size, size, 1.0f, 
-				mat, size, 0.0f, test_mat, size);
-
-	fprintf(stderr, "comparing results ...\n");
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", test_mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-			}
-		}
-		printf("\n");
-	}
-#endif
-
-	return 0;
-}
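A note on the recursion above: cholesky_grain_rec() factorizes only the first nbigblocks block columns at the current granularity, then unpartitions and unregisters the matrix and recurses on the trailing corner with twice as many (hence smaller) blocks. The pointer to that corner, &matA[nbigblocks*(size/nblocks)*(ld+1)], advances by one block size along both the row and the column direction at once, since moving one element down the diagonal costs ld+1 elements in a layout with leading dimension ld. A tiny self-contained check of that arithmetic (illustration only; trailing_corner is a made-up name):

#include <assert.h>
#include <stddef.h>

/* Start of diagonal block (b,b) for square tiles of bs elements per side in
 * a layout with leading dimension ld; mirrors the expression used above. */
static float *trailing_corner(float *matA, unsigned ld, unsigned bs, unsigned b)
{
	return &matA[(size_t)b * bs * (ld + 1)];
}

int main(void)
{
	float mat[8 * 8];
	/* ld = 8, 4x4 tiles: block (1,1) starts 4 rows and 4 columns in,
	 * i.e. at linear index 4*8 + 4 = 4*(8+1) = 36. */
	assert(trailing_corner(mat, 8, 4, 1) == &mat[36]);
	return 0;
}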

+ 0 - 286
examples/cholesky_2ctxs/cholesky/cholesky_implicit.c

@@ -1,286 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  INRIA
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "cholesky.h"
-
-/*
- *	Create the codelets
- */
-
-static starpu_codelet cl11 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.type = STARPU_SEQ,
-	.cpu_func = chol_cpu_codelet_update_u11,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
-static starpu_codelet cl21 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.type = STARPU_SEQ,
-	.cpu_func = chol_cpu_codelet_update_u21,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
-static starpu_codelet cl22 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.type = STARPU_SEQ,
-	.max_parallelism = INT_MAX,
-	.cpu_func = chol_cpu_codelet_update_u22,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
-/*
- *	code to bootstrap the factorization
- *	and construct the DAG
- */
-
-static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
-{
-	cl22.type = STARPU_SPMD;
-}
-
-static double _cholesky(starpu_data_handle dataA, unsigned nblocks, double *timing)
-{
-	struct timeval start;
-	struct timeval end;
-
-	unsigned i,j,k;
-
-	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
-
-	gettimeofday(&start, NULL);
-
-	/* create all the DAG nodes */
-	for (k = 0; k < nblocks; k++)
-	{
-                starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
-
-		starpu_insert_task(&cl11,
-				   STARPU_PRIORITY, prio_level,
-				   STARPU_RW, sdatakk,
-				   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
-				   0);
-
-		for (j = k+1; j<nblocks; j++)
-		{
-                        starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
-			starpu_insert_task(&cl21,
-					   STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
-					   STARPU_R, sdatakk,
-					   STARPU_RW, sdatakj,
-					   0);
-
-			for (i = k+1; i<nblocks; i++)
-			{
-				if (i <= j)
-                                {
-					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
-					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
-					
-					starpu_insert_task(&cl22,
-							   STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
-							   STARPU_R, sdataki,
-							   STARPU_R, sdatakj,
-							   STARPU_RW, sdataij,
-							   0);
-                                }
-			}
-		}
-	}
-
-	starpu_task_wait_for_all();
-		
-	starpu_data_unpartition(dataA, 0);
-
-	gettimeofday(&end, NULL);
-
-	(*timing) = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-	unsigned long n = starpu_matrix_get_nx(dataA);
-
-	double flop = (1.0f*n*n*n)/3.0f;
-
-	double gflops = (flop/(*timing)/1000.0f);
-	(*timing) /= 1000000.0f; //sec
-	//	(*timing) /= 60.0f; //min
-	return gflops;
-}
-
-static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, double *timing)
-{
-	starpu_data_handle dataA;
-
-	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
-
-	struct starpu_data_filter f = {
-		.filter_func = starpu_vertical_block_filter_func,
-		.nchildren = nblocks
-	};
-
-	struct starpu_data_filter f2 = {
-		.filter_func = starpu_block_filter_func,
-		.nchildren = nblocks
-	};
-
-	starpu_data_map_filters(dataA, 2, &f, &f2);
-	double gflops = _cholesky(dataA, nblocks, timing);
-	starpu_data_unregister(dataA);
-	return gflops;
-}
-
-double run_cholesky_implicit(int start, int argc, char **argv, double *timing, pthread_barrier_t *barrier)
-{
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
-	unsigned size = 4 * 1024;
-	unsigned nblocks = 16;
-	parse_args_ctx(start, argc, argv, &size, &nblocks);
-
-	//	starpu_init(NULL);
-
-	//	starpu_helper_cublas_init();
-
-	float *mat;
-
-	starpu_malloc((void **)&mat, (size_t)size*size*sizeof(float));
-
-	unsigned i,j;
-	for (i = 0; i < size; i++)
-	{
-		for (j = 0; j < size; j++)
-		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
-		}
-	}
-
-//#define PRINT_OUTPUT
-#ifdef PRINT_OUTPUT
-	printf("Input :\n");
-
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-			}
-		}
-		printf("\n");
-	}
-#endif
-	double gflops = cholesky(mat, size, size, nblocks, timing);
-
-#ifdef PRINT_OUTPUT
-	printf("Results :\n");
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
-			}
-		}
-		printf("\n");
-	}
-#endif
-
-	if (check)
-	{
-		fprintf(stderr, "compute explicit LLt ...\n");
-		for (j = 0; j < size; j++)
-		{
-			for (i = 0; i < size; i++)
-			{
-				if (i > j) {
-					mat[j+i*size] = 0.0f; // debug
-				}
-			}
-		}
-		float *test_mat = malloc(size*size*sizeof(float));
-		STARPU_ASSERT(test_mat);
-	
-		SSYRK("L", "N", size, size, 1.0f,
-					mat, size, 0.0f, test_mat, size);
-	
-		fprintf(stderr, "comparing results ...\n");
-#ifdef PRINT_OUTPUT
-		for (j = 0; j < size; j++)
-		{
-			for (i = 0; i < size; i++)
-			{
-				if (i <= j) {
-					printf("%2.2f\t", test_mat[j +i*size]);
-				}
-				else {
-					printf(".\t");
-				}
-			}
-			printf("\n");
-		}
-#endif
-	
-		for (j = 0; j < size; j++)
-		{
-			for (i = 0; i < size; i++)
-			{
-				if (i <= j) {
-	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = abs(test_mat[j +i*size] - orig);
-	                                if (err > 0.00001) {
-	                                        fprintf(stderr, "Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
-	                                        assert(0);
-	                                }
-	                        }
-			}
-	        }
-	}
-	starpu_free((void *)mat);
-	//	starpu_helper_cublas_shutdown();
-	//	starpu_shutdown();
-
-	return gflops;
-}
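The file above is the implicit-dependency variant: no tags are declared at all, and the task graph is inferred by StarPU from the access modes (STARPU_R / STARPU_RW) on the sub-matrix handles passed to starpu_insert_task. Stripped of priorities, the SPMD callback and error handling, the submission skeleton reduces to the sketch below (illustration only; submit_tiled_cholesky is a made-up name, and cl11, cl21, cl22 and the partitioned handle are assumed to be set up exactly as in the removed file):

/* assumes <starpu.h> is included and cl11, cl21, cl22 are in scope */
static void submit_tiled_cholesky(starpu_data_handle dataA, unsigned nblocks)
{
	unsigned i, j, k;

	for (k = 0; k < nblocks; k++)
	{
		/* factorize the diagonal block (k,k) */
		starpu_insert_task(&cl11,
				   STARPU_RW, starpu_data_get_sub_data(dataA, 2, k, k),
				   0);

		for (j = k+1; j < nblocks; j++)
		{
			/* triangular solve against panel block (k,j) */
			starpu_insert_task(&cl21,
					   STARPU_R,  starpu_data_get_sub_data(dataA, 2, k, k),
					   STARPU_RW, starpu_data_get_sub_data(dataA, 2, k, j),
					   0);

			/* trailing updates, lower triangle only */
			for (i = k+1; i <= j; i++)
				starpu_insert_task(&cl22,
						   STARPU_R,  starpu_data_get_sub_data(dataA, 2, k, i),
						   STARPU_R,  starpu_data_get_sub_data(dataA, 2, k, j),
						   STARPU_RW, starpu_data_get_sub_data(dataA, 2, i, j),
						   0);
		}
	}

	/* the RW/R conflicts on the handles encode the whole DAG */
	starpu_task_wait_for_all();
}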

+ 0 - 280
examples/cholesky_2ctxs/cholesky/cholesky_implicit_all_machine.c

@@ -1,280 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "cholesky.h"
-
-/*
- *	Create the codelets
- */
-
-static starpu_codelet cl11 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.type = STARPU_SEQ,
-	.cpu_func = chol_cpu_codelet_update_u11,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
-static starpu_codelet cl21 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.type = STARPU_SEQ,
-	.cpu_func = chol_cpu_codelet_update_u21,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
-static starpu_codelet cl22 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.type = STARPU_SEQ,
-	.max_parallelism = INT_MAX,
-	.cpu_func = chol_cpu_codelet_update_u22,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
-/*
- *	code to bootstrap the factorization
- *	and construct the DAG
- */
-
-static void callback_turn_spmd_on(void *arg __attribute__ ((unused)))
-{
-	cl22.type = STARPU_SPMD;
-}
-
-static double _cholesky(starpu_data_handle dataA, unsigned nblocks)
-{
-	struct timeval start;
-	struct timeval end;
-
-	unsigned i,j,k;
-
-	int prio_level = noprio?STARPU_DEFAULT_PRIO:STARPU_MAX_PRIO;
-
-	gettimeofday(&start, NULL);
-
-	/* create all the DAG nodes */
-	for (k = 0; k < nblocks; k++)
-	{
-                starpu_data_handle sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
-
-                starpu_insert_task(&cl11,
-                                   STARPU_PRIORITY, prio_level,
-                                   STARPU_RW, sdatakk,
-				   STARPU_CALLBACK, (k == 3*nblocks/4)?callback_turn_spmd_on:NULL,
-                                   0);
-
-		for (j = k+1; j<nblocks; j++)
-		{
-                        starpu_data_handle sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
-
-                        starpu_insert_task(&cl21,
-                                           STARPU_PRIORITY, (j == k+1)?prio_level:STARPU_DEFAULT_PRIO,
-                                           STARPU_R, sdatakk,
-                                           STARPU_RW, sdatakj,
-                                           0);
-
-			for (i = k+1; i<nblocks; i++)
-			{
-				if (i <= j)
-                                {
-					starpu_data_handle sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
-					starpu_data_handle sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
-					
-					starpu_insert_task(&cl22,
-                                                           STARPU_PRIORITY, ((i == k+1) && (j == k+1))?prio_level:STARPU_DEFAULT_PRIO,
-                                                           STARPU_R, sdataki,
-                                                           STARPU_R, sdatakj,
-                                                           STARPU_RW, sdataij,
-                                                           0);
-                                }
-			}
-		}
-	}
-
-	starpu_task_wait_for_all();
-
-	starpu_data_unpartition(dataA, 0);
-
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	unsigned long n = starpu_matrix_get_nx(dataA);
-
-	double flop = (1.0f*n*n*n)/3.0f;
-	return (flop/timing/1000.0f);
-}
-
-static double cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
-{
-	starpu_data_handle dataA;
-
-	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
-
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
-
-	starpu_data_map_filters(dataA, 2, &f, &f2);
-
-	return _cholesky(dataA, nblocks);
-}
-
-double run_cholesky_implicit_all_machine(int argc, char **argv)
-{
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
-	parse_args(argc, argv);
-
-	//	starpu_init(NULL);
-
-	//	starpu_helper_cublas_init();
-
-	float *mat;
-	starpu_data_malloc_pinned_if_possible((void **)&mat, (size_t)size*size*sizeof(float));
-
-	unsigned i,j;
-	for (i = 0; i < size; i++)
-	{
-		for (j = 0; j < size; j++)
-		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
-		}
-	}
-
-//#define PRINT_OUTPUT
-#ifdef PRINT_OUTPUT
-	printf("Input :\n");
-
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-			}
-		}
-		printf("\n");
-	}
-#endif
-
-	double gflops = cholesky(mat, size, size, nblocks);
-
-#ifdef PRINT_OUTPUT
-	printf("Results :\n");
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
-			}
-		}
-		printf("\n");
-	}
-#endif
-
-	if (check)
-	{
-		fprintf(stderr, "compute explicit LLt ...\n");
-		for (j = 0; j < size; j++)
-		{
-			for (i = 0; i < size; i++)
-			{
-				if (i > j) {
-					mat[j+i*size] = 0.0f; // debug
-				}
-			}
-		}
-		float *test_mat = malloc(size*size*sizeof(float));
-		STARPU_ASSERT(test_mat);
-	
-		SSYRK("L", "N", size, size, 1.0f,
-					mat, size, 0.0f, test_mat, size);
-	
-		fprintf(stderr, "comparing results ...\n");
-#ifdef PRINT_OUTPUT
-		for (j = 0; j < size; j++)
-		{
-			for (i = 0; i < size; i++)
-			{
-				if (i <= j) {
-					printf("%2.2f\t", test_mat[j +i*size]);
-				}
-				else {
-					printf(".\t");
-				}
-			}
-			printf("\n");
-		}
-#endif
-	
-		for (j = 0; j < size; j++)
-		{
-			for (i = 0; i < size; i++)
-			{
-				if (i <= j) {
-	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = abs(test_mat[j +i*size] - orig);
-	                                if (err > 0.00001) {
-	                                        fprintf(stderr, "Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", i, j, test_mat[j +i*size], orig, err);
-	                                        assert(0);
-	                                }
-	                        }
-			}
-	        }
-	}
-
-	//	starpu_helper_cublas_shutdown();
-	//	starpu_shutdown();
-
-	return gflops;
-}

+ 0 - 230
examples/cholesky_2ctxs/cholesky/cholesky_kernels.c

@@ -1,230 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu_config.h>
-#include "cholesky.h"
-#include "../../common/blas.h"
-#ifdef STARPU_USE_CUDA
-#include <starpu_cuda.h>
-#endif
-
-/*
- *   U22 
- */
-
-static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
-{
-	//printf("22\n");
-	float *left 	= (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-	float *right 	= (float *)STARPU_MATRIX_GET_PTR(descr[1]);
-	float *center 	= (float *)STARPU_MATRIX_GET_PTR(descr[2]);
-
-	unsigned dx = STARPU_MATRIX_GET_NY(descr[2]);
-	unsigned dy = STARPU_MATRIX_GET_NX(descr[2]);
-	unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
-
-	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[0]);
-	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
-	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
-
-	if (s == 0)
-	{
-		int worker_size = starpu_combined_worker_get_size();
-
-		if (worker_size == 1)
-		{
-			/* Sequential CPU kernel */
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
-				right, ld12, 1.0f, center, ld22);
-		}
-		else {
-			/* Parallel CPU kernel */
-			int rank = starpu_combined_worker_get_rank();
-
-			int block_size = (dx + worker_size - 1)/worker_size;
-			int new_dx = STARPU_MIN(dx, block_size*(rank+1)) - block_size*rank;
-			
-			float *new_left = &left[block_size*rank];
-			float *new_center = &center[block_size*rank];
-
-			SGEMM("N", "T", dy, new_dx, dz, -1.0f, new_left, ld21, 
-				right, ld12, 1.0f, new_center, ld22);
-		}
-	}
-	else
-	{
-		/* CUDA kernel */
-#ifdef STARPU_USE_CUDA
-		cublasSgemm('n', 't', dy, dx, dz, 
-				-1.0f, left, ld21, right, ld12, 
-				 1.0f, center, ld22);
-		cudaStreamSynchronize(starpu_cuda_get_local_stream());
-#endif
-
-	}
-}
-
-void chol_cpu_codelet_update_u22(void *descr[], void *_args)
-{
-	chol_common_cpu_codelet_update_u22(descr, 0, _args);
-}
-
-#ifdef STARPU_USE_CUDA
-void chol_cublas_codelet_update_u22(void *descr[], void *_args)
-{
-	chol_common_cpu_codelet_update_u22(descr, 1, _args);
-}
-#endif// STARPU_USE_CUDA
-
-/* 
- * U21
- */
-
-static inline void chol_common_codelet_update_u21(void *descr[], int s, __attribute__((unused)) void *_args)
-{
-//	printf("21\n");
-	float *sub11;
-	float *sub21;
-
-	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-	sub21 = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
-
-	unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
-	unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
-
-	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
-	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
-
-	switch (s) {
-		case 0:
-			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			break;
-#ifdef STARPU_USE_CUDA
-		case 1:
-			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-void chol_cpu_codelet_update_u21(void *descr[], void *_args)
-{
-	 chol_common_codelet_update_u21(descr, 0, _args);
-}
-
-#ifdef STARPU_USE_CUDA
-void chol_cublas_codelet_update_u21(void *descr[], void *_args)
-{
-	chol_common_codelet_update_u21(descr, 1, _args);
-}
-#endif 
-
-/*
- *	U11
- */
-
-static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
-{
-//	printf("11\n");
-	float *sub11;
-
-	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
-
-	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
-	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
-
-	unsigned z;
-
-	switch (s) {
-		case 0:
-
-			/*
-			 *	- alpha 11 <- lambda 11 = sqrt(alpha11)
-			 *	- alpha 21 <- l 21	= alpha 21 / lambda 11
-			 *	- A22 <- A22 - l21 trans(l21)
-			 */
-
-			for (z = 0; z < nx; z++)
-			{
-				float lambda11;
-				lambda11 = sqrt(sub11[z+z*ld]);
-				sub11[z+z*ld] = lambda11;
-
-				STARPU_ASSERT(lambda11 != 0.0f);
-		
-				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
-		
-				SSYR("L", nx - z - 1, -1.0f, 
-							&sub11[(z+1)+z*ld], 1,
-							&sub11[(z+1)+(z+1)*ld], ld);
-			}
-			break;
-#ifdef STARPU_USE_CUDA
-		case 1:
-			{
-			float *lambda11;
-			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
-
-			for (z = 0; z < nx; z++)
-			{
-
-				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
-
-				STARPU_ASSERT(*lambda11 != 0.0f);
-				
-				*lambda11 = sqrt(*lambda11);
-
-//				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
-				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
-
-				cublasSscal(nx - z - 1, 1.0f/(*lambda11), &sub11[(z+1)+z*ld], 1);
-
-				cublasSsyr('U', nx - z - 1, -1.0f,
-							&sub11[(z+1)+z*ld], 1,
-							&sub11[(z+1)+(z+1)*ld], ld);
-			}
-
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
-			cudaFreeHost(lambda11);
-			}
-		
-
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-
-void chol_cpu_codelet_update_u11(void *descr[], void *_args)
-{
-	chol_common_codelet_update_u11(descr, 0, _args);
-}
-
-#ifdef STARPU_USE_CUDA
-void chol_cublas_codelet_update_u11(void *descr[], void *_args)
-{
-	chol_common_codelet_update_u11(descr, 1, _args);
-}
-#endif// STARPU_USE_CUDA
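The ASCII comment inside chol_common_codelet_update_u11 above is one step of an unblocked, right-looking Cholesky factorization of the diagonal tile. Written out (standard linear algebra, not something introduced by this commit), at column z the trailing part of the tile is partitioned and updated as

\[
\begin{pmatrix} a_{11} & \ast \\ a_{21} & A_{22} \end{pmatrix} :\qquad
\lambda_{11} = \sqrt{a_{11}}, \qquad
l_{21} = a_{21} / \lambda_{11}, \qquad
A_{22} \leftarrow A_{22} - l_{21} l_{21}^{T},
\]

which is what the SSCAL and SSYR calls in the CPU branch compute column by column; the CUDA branch mirrors it with cublasSscal and cublasSsyr after fetching the diagonal element back to the host to take its square root.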

+ 0 - 153
examples/cholesky_2ctxs/cholesky/cholesky_models.c

@@ -1,153 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Télécom-SudParis
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*
- * As a convention, in that file, descr[0] is represented by A,
- * 				  descr[1] is B ...
- */
-
-/*
- *	Number of flops of Gemm 
- */
-
-#include <starpu.h>
-
-/* #define USE_PERTURBATION	1 */
-
-#ifdef USE_PERTURBATION
-#define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
-#else
-#define PERTURBATE(a)	(a)
-#endif
-
-static double cpu_chol_task_11_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
-
-#ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
-
-#ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double cpu_chol_task_21_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
-
-#ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
-
-#ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double cpu_chol_task_22_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
-
-#ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
-
-#ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-struct starpu_perfmodel_t chol_model_11 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_11_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_11_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_11"
-};
-
-struct starpu_perfmodel_t chol_model_21 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_21_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_21_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_21"
-};
-
-struct starpu_perfmodel_t chol_model_22 = {
-	.per_arch = {
-		[STARPU_CPU_DEFAULT][0] = { .cost_model = cpu_chol_task_22_cost },
-		[STARPU_CUDA_DEFAULT][0] = { .cost_model = cuda_chol_task_22_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "chol_model_22"
-};

+ 0 - 370
examples/cholesky_2ctxs/cholesky/cholesky_tag.c

@@ -1,370 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "cholesky.h"
-
-/*
- *	Some useful functions
- */
-
-static struct starpu_task *create_task(starpu_tag_t id)
-{
-	struct starpu_task *task = starpu_task_create();
-		task->cl_arg = NULL;
-		task->use_tag = 1;
-		task->tag_id = id;
-
-	return task;
-}
-
-/*
- *	Create the codelets
- */
-
-static starpu_codelet cl11 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u11,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
-static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
-{
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
-
-	struct starpu_task *task = create_task(TAG11(k));
-	
-	task->cl = &cl11;
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->buffers[0].mode = STARPU_RW;
-
-	/* this is an important task */
-	if (!noprio)
-		task->priority = STARPU_MAX_PRIO;
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
-	}
-
-	return task;
-}
-
-static starpu_codelet cl21 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u21,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
-static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, struct starpu_sched_ctx *sched_ctx)
-{
-	struct starpu_task *task = create_task(TAG21(k, j));
-
-	task->cl = &cl21;	
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, k); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_RW;
-
-	if (!noprio && (j == k+1)) {
-		task->priority = STARPU_MAX_PRIO;
-	}
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
-	}
-	else {
-		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
-	}
-
-	int ret = starpu_task_submit_to_ctx(task, sched_ctx);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
-
-}
-
-static starpu_codelet cl22 =
-{
-	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u22,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
-static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, struct starpu_sched_ctx *sched_ctx)
-{
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
-
-	struct starpu_task *task = create_task(TAG22(k, i, j));
-
-	task->cl = &cl22;
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, k, i); 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = starpu_data_get_sub_data(dataA, 2, k, j); 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = starpu_data_get_sub_data(dataA, 2, i, j); 
-	task->buffers[2].mode = STARPU_RW;
-
-	if (!noprio && (i == k + 1) && (j == k +1) ) {
-		task->priority = STARPU_MAX_PRIO;
-	}
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
-	}
-	else {
-		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
-	}
-
-	int ret = starpu_task_submit_to_ctx(task, sched_ctx);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
-}
-
-
-
-/*
- *	code to bootstrap the factorization 
- *	and construct the DAG
- */
-
-static void _cholesky(starpu_data_handle dataA, unsigned nblocks, struct starpu_sched_ctx *sched_ctx)
-{
-	struct timeval start;
-	struct timeval end;
-
-	struct starpu_task *entry_task = NULL;
-
-	/* create all the DAG nodes */
-	unsigned i,j,k;
-
-	gettimeofday(&start, NULL);
-
-	for (k = 0; k < nblocks; k++)
-	{
-		struct starpu_task *task = create_task_11(dataA, k);
-		/* we defer the launch of the first task */
-		if (k == 0) {
-			entry_task = task;
-		}
-		else {
-		  int ret = starpu_task_submit_to_ctx(task, sched_ctx);
-                        if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                                fprintf(stderr, "No worker may execute this task\n");
-                                exit(0);
-                        }
-
-		}
-		
-		for (j = k+1; j<nblocks; j++)
-		{
-		  create_task_21(dataA, k, j, sched_ctx);
-
-			for (i = k+1; i<nblocks; i++)
-			{
-				if (i <= j)
-				  create_task_22(dataA, k, i, j, sched_ctx);
-			}
-		}
-	}
-
-	/* schedule the codelet */
-	int ret = starpu_task_submit_to_ctx(entry_task, sched_ctx);
-        if (STARPU_UNLIKELY(ret == -ENODEV)) {
-                fprintf(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
-
-
-	/* stall the application until the end of computations */
-	starpu_tag_wait(TAG11(nblocks-1));
-
-	starpu_data_unpartition(dataA, 0);
-
-	gettimeofday(&end, NULL);
-
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
-
-	unsigned n = starpu_matrix_get_nx(dataA);
-
-	double flop = (1.0f*n*n*n)/3.0f;
-	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
-}
-
-static void initialize_system(float **A, unsigned dim, unsigned pinned)
-{
-  //	starpu_init(NULL);
-	
-	starpu_helper_cublas_init();
-
-	if (pinned)
-	{
-		starpu_data_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
-	} 
-	else {
-		*A = malloc(dim*dim*sizeof(float));
-	}
-}
-
-static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks, struct starpu_sched_ctx *sched_ctx)
-{
-	starpu_data_handle dataA;
-
-	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
-	starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
-
-	starpu_data_set_sequential_consistency_flag(dataA, 0);
-
-	struct starpu_data_filter f;
-		f.filter_func = starpu_vertical_block_filter_func;
-		f.nchildren = nblocks;
-		f.get_nchildren = NULL;
-		f.get_child_ops = NULL;
-
-	struct starpu_data_filter f2;
-		f2.filter_func = starpu_block_filter_func;
-		f2.nchildren = nblocks;
-		f2.get_nchildren = NULL;
-		f2.get_child_ops = NULL;
-
-	starpu_data_map_filters(dataA, 2, &f, &f2);
-
-	_cholesky(dataA, nblocks, sched_ctx);
-
-	starpu_helper_cublas_shutdown();
-
-	//	starpu_shutdown();
-}
-
-int run_cholesky_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
-{
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
-	 * */
-
-	parse_args(argc, argv);
-
-	float *mat;
-
-	mat = malloc(size*size*sizeof(float));
-	initialize_system(&mat, size, pinned);
-
-	unsigned i,j;
-	for (i = 0; i < size; i++)
-	{
-		for (j = 0; j < size; j++)
-		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
-		}
-	}
-
-
-#ifdef CHECK_OUTPUT
-	printf("Input :\n");
-
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-			}
-		}
-		printf("\n");
-	}
-#endif
-
-
-	cholesky(mat, size, size, nblocks, sched_ctx);
-
-#ifdef CHECK_OUTPUT
-	printf("Results :\n");
-
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-				mat[j+i*size] = 0.0f; // debug
-			}
-		}
-		printf("\n");
-	}
-
-	fprintf(stderr, "compute explicit LLt ...\n");
-	float *test_mat = malloc(size*size*sizeof(float));
-	STARPU_ASSERT(test_mat);
-
-	SSYRK("L", "N", size, size, 1.0f, 
-				mat, size, 0.0f, test_mat, size);
-
-	fprintf(stderr, "comparing results ...\n");
-	for (j = 0; j < size; j++)
-	{
-		for (i = 0; i < size; i++)
-		{
-			if (i <= j) {
-				printf("%2.2f\t", test_mat[j +i*size]);
-			}
-			else {
-				printf(".\t");
-			}
-		}
-		printf("\n");
-	}
-#endif
-
-	return 0;
-}
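This variant builds the same DAG as the implicit one, but every edge is spelled out with starpu_tag_declare_deps: the 11 task of step k waits for the previous step's 22 update of block (k,k), each 21 task waits for its step's 11 task (plus, for k > 0, the previous 22 update of its block), and each 22 task waits for the two 21 tasks feeding it (plus its own previous update). Unrolled for the smallest non-trivial case, nblocks = 2, and reusing only the TAG macros from the removed cholesky.h and the calls visible above, the declared dependencies are (illustration only, not a standalone program):

	/* step k = 0: the single 21 task only needs the 11 task,
	 * and the 22 task on block (1,1) lists TAG21(0,1) twice (i == j == 1) */
	starpu_tag_declare_deps(TAG21(0, 1), 1, TAG11(0));
	starpu_tag_declare_deps(TAG22(0, 1, 1), 2, TAG21(0, 1), TAG21(0, 1));

	/* step k = 1: the final diagonal update waits for that 22 task */
	starpu_tag_declare_deps(TAG11(1), 1, TAG22(0, 1, 1));

	/* the application then blocks on the last diagonal tag */
	starpu_tag_wait(TAG11(1));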

+ 0 - 307
examples/cholesky_2ctxs/cholesky/cholesky_tile_tag.c

@@ -1,307 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "cholesky.h"
-
-/*
- *	Some useful functions
- */
-
-/* A [ y ] [ x ] */
-float *ch_A[NMAXBLOCKS][NMAXBLOCKS];
-starpu_data_handle ch_A_state[NMAXBLOCKS][NMAXBLOCKS];
-
-static struct starpu_task *create_task(starpu_tag_t id)
-{
-	struct starpu_task *task = starpu_task_create();
-		task->cl_arg = NULL;
-		task->use_tag = 1;
-		task->tag_id = id;
-
-	return task;
-}
-
-/*
- *	Create the codelets
- */
-
-static starpu_codelet cl11 =
-{
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u11,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
-#endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_POTRF
-	.gordon_func = SPU_FUNC_POTRF,
-#else
-#warning SPU_FUNC_POTRF is not available
-#endif
-#endif
-	.nbuffers = 1,
-	.model = &chol_model_11
-};
-
-static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
-{
-//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
-
-	struct starpu_task *task = create_task(TAG11(k));
-	
-	task->cl = &cl11;
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = ch_A_state[k][k];
-	task->buffers[0].mode = STARPU_RW;
-
-	/* this is an important task */
-	task->priority = STARPU_MAX_PRIO;
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
-	}
-
-	return task;
-}
-
-static starpu_codelet cl21 =
-{
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u21,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
-#endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_STRSM
-	.gordon_func = SPU_FUNC_STRSM,
-#else
-#warning SPU_FUNC_STRSM is not available
-#endif
-#endif
-	.nbuffers = 2,
-	.model = &chol_model_21
-};
-
-static void create_task_21(unsigned k, unsigned j, struct starpu_sched_ctx *sched_ctx)
-{
-	struct starpu_task *task = create_task(TAG21(k, j));
-
-	task->cl = &cl21;	
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = ch_A_state[k][k]; 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = ch_A_state[j][k]; 
-	task->buffers[1].mode = STARPU_RW;
-
-	if (j == k+1) {
-		task->priority = STARPU_MAX_PRIO;
-	}
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
-	}
-	else {
-		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
-	}
-
-	starpu_task_submit_to_ctx(task, sched_ctx);
-}
-
-static starpu_codelet cl22 =
-{
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
-	.cpu_func = chol_cpu_codelet_update_u22,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
-#endif
-#ifdef STARPU_USE_GORDON
-#ifdef SPU_FUNC_SGEMM
-	.gordon_func = SPU_FUNC_SGEMM,
-#else
-#warning SPU_FUNC_SGEMM is not available
-#endif
-#endif
-	.nbuffers = 3,
-	.model = &chol_model_22
-};
-
-static void create_task_22(unsigned k, unsigned i, unsigned j, struct starpu_sched_ctx *sched_ctx)
-{
-//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
-
-	struct starpu_task *task = create_task(TAG22(k, i, j));
-
-	task->cl = &cl22;
-
-	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = ch_A_state[i][k]; 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = ch_A_state[j][k]; 
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = ch_A_state[j][i]; 
-	task->buffers[2].mode = STARPU_RW;
-
-	if ( (i == k + 1) && (j == k +1) ) {
-		task->priority = STARPU_MAX_PRIO;
-	}
-
-	/* enforce dependencies ... */
-	if (k > 0) {
-		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
-	}
-	else {
-		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
-	}
-
-	starpu_task_submit_to_ctx(task, sched_ctx);
-}
-
-
-
-/*
- *	code to bootstrap the factorization 
- *	and construct the DAG
- */
-
-static double cholesky_no_stride(struct starpu_sched_ctx *sched_ctx)
-{
-	struct timeval start;
-	struct timeval end;
-
-	struct starpu_task *entry_task = NULL;
-
-	/* create all the DAG nodes */
-	unsigned i,j,k;
-
-	for (k = 0; k < nblocks; k++)
-	{
-	  struct starpu_task *task = create_task_11(k, nblocks);
-		/* we defer the launch of the first task */
-		if (k == 0) {
-			entry_task = task;
-		}
-		else {
-		  starpu_task_submit_to_ctx(task, sched_ctx);
-		}
-		
-		for (j = k+1; j<nblocks; j++)
-		{
-		  create_task_21(k, j, sched_ctx);
-
-			for (i = k+1; i<nblocks; i++)
-			{
-				if (i <= j)
-				  create_task_22(k, i, j, sched_ctx);
-			}
-		}
-	}
-
-	/* schedule the codelet */
-	gettimeofday(&start, NULL);
-		
-	starpu_task_submit_to_ctx(entry_task, sched_ctx);
-
-	/* stall the application until the end of computations */
-	starpu_tag_wait(TAG11(nblocks-1));
-	printf("cholesky finish wait for %d blocks \n", nblocks - 1);
-
-        gettimeofday(&end, NULL);
-
-        double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-        double flop = (1.0f*size*size*size)/3.0f;
-	return flop/timing/1000.0f;
-}
-
-double run_cholesky_tile_tag(struct starpu_sched_ctx *sched_ctx, int argc, char **argv)
-{
-	unsigned x, y;
-	unsigned i, j;
-
-	parse_args(argc, argv);
-	assert(nblocks <= NMAXBLOCKS);
-
-	//	fprintf(stderr, "BLOCK SIZE = %d\n", size / nblocks);
-
-	//	starpu_init(NULL);
-
-	/* Disable sequential consistency */
-	starpu_data_set_default_sequential_consistency_flag(0);
-
-	//	starpu_helper_cublas_init();
-
-	for (y = 0; y < nblocks; y++)
-	for (x = 0; x < nblocks; x++)
-	{
-		if (x <= y) {
-			ch_A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
-			assert(ch_A[y][x]);
-		}
-	}
-
-
-	for (y = 0; y < nblocks; y++)
-	for (x = 0; x < nblocks; x++)
-	{
-		if (x <= y) {
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-			posix_memalign((void **)&ch_A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
-#else
-			ch_A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
-#endif
-			assert(ch_A[y][x]);
-		}
-	}
-
-	/* create a simple definite positive symetric matrix example
-	 *
-	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable ) 
-	 * */
-	for (y = 0; y < nblocks; y++)
-	for (x = 0; x < nblocks; x++)
-	if (x <= y) {
-		for (i = 0; i < BLOCKSIZE; i++)
-		for (j = 0; j < BLOCKSIZE; j++)
-		{
-			ch_A[y][x][i*BLOCKSIZE + j] =
-				(float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
-
-			/* make it a little more numerically stable ... ;) */
-			if ((x == y) && (i == j))
-				ch_A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
-		}
-	}
-
-
-
-	for (y = 0; y < nblocks; y++)
-	for (x = 0; x < nblocks; x++)
-	{
-		if (x <= y) {
-			starpu_matrix_data_register(&ch_A_state[y][x], 0, (uintptr_t)ch_A[y][x], 
-				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
-		}
-	}
-
-	return cholesky_no_stride(sched_ctx);
-
-	//	starpu_shutdown();
-	//	return 0;
-}
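Unlike the filter-based variants above, this file gives every lower-triangular tile its own allocation and its own data handle, and disables sequential consistency globally since the ordering is carried entirely by the tags. In the registration call the three BLOCKSIZE arguments are the leading dimension followed by the two tile extents, which coincide here because each tile is stored as a contiguous square block. The same call with the arguments annotated (illustration only, identical to the one above):

	starpu_matrix_data_register(&ch_A_state[y][x],
				    0,                     /* home node: main RAM */
				    (uintptr_t)ch_A[y][x], /* tile base pointer */
				    BLOCKSIZE,             /* leading dimension */
				    BLOCKSIZE,             /* nx */
				    BLOCKSIZE,             /* ny */
				    sizeof(float));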

+ 0 - 231
examples/cholesky_2ctxs/cholesky_2ctxs.c

@@ -1,231 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2011  INRIA
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "cholesky/cholesky.h"
-#include <pthread.h>
-
-typedef struct {
-  int start;
-  int argc;
-  char **argv;
-  unsigned ctx;
-  int the_other_ctx;
-  int *procs;
-  int nprocs;
-} params;
-
-typedef struct {
-  double flops;
-  double avg_timing;
-} retvals;
-
-#define NSAMPLES 3
-int first = 1;
-pthread_mutex_t mut;
-
-pthread_barrier_t barrier;
-
-void* func_cholesky(void *val){
-  params *p = (params*)val;
-  unsigned *sched_ctx = &p->ctx;
-  int the_other_ctx = p->the_other_ctx;
-
-  int i;
-  retvals *rv  = (retvals*)malloc(sizeof(retvals));
-  rv->flops = 0;
-  rv->avg_timing = 0;
-  double timing = 0;
-
-  starpu_set_sched_ctx(sched_ctx);
-  for(i = 0; i < NSAMPLES; i++)
-    {
-      rv->flops += run_cholesky_implicit(p->start, p->argc, p->argv, &timing, &barrier);
-      rv->avg_timing += timing;
-
-    }
-
-
-  pthread_mutex_lock(&mut);
-  if(first){
-      starpu_delete_sched_ctx(p->ctx, the_other_ctx);
-  }
-
-  first = 0;
-  pthread_mutex_unlock(&mut);
- 
-
-  rv->flops /= NSAMPLES;
-  rv->avg_timing /= NSAMPLES;
-  return (void*)rv;
-}
-
-void cholesky_vs_cholesky(params *p1, params *p2, params *p3, 
-			  unsigned cpu1, unsigned cpu2,
-			  unsigned gpu, unsigned gpu1, unsigned gpu2){
-
-  int nprocs1 = cpu1 + gpu + gpu1;
-  int nprocs2 = cpu2 + gpu + gpu2;
-  unsigned n_all_gpus = gpu + gpu1 + gpu2;
-
-  /* 2 cholesky in different ctxs */
-  starpu_init(NULL);
-  starpu_helper_cublas_init();
-
-  int procs[nprocs1];
-  int i;
-  int k = 0;
-
-  for(i = 0; i < gpu; i++)
-    {
-      procs[k++] = i;
-      //      printf("%d ", i);
-    }
-
-  for(i = gpu; i < gpu + gpu1; i++)
-    {
-      procs[k++] = i;
-      //printf("%d ", i);
-    }
-
-  for(i = n_all_gpus; i < n_all_gpus + cpu1; i++)
-    {
-      procs[k++] = i;
-      //printf("%d ", i);
-    }
-  //printf("\n");
-
-
-  p1->ctx = starpu_create_sched_ctx("heft", procs, nprocs1, "cholesky1");
-  p2->the_other_ctx = (int)p1->ctx;
-  p1->procs = procs;
-  p1->nprocs = nprocs1;
-  int procs2[nprocs2];
-
-  k = 0;
-
-  for(i = 0; i < gpu; i++){
-    procs2[k++] = i;
-    //    printf("%d ", i);
-  }
-
-  for(i = gpu + gpu1; i < gpu + gpu1 + gpu2; i++){
-    procs2[k++] = i;
-    //    printf("%d ", i);
-  }
-
-  for(i = n_all_gpus  + cpu1; i < n_all_gpus + cpu1 + cpu2; i++){
-    procs2[k++] = i;
-    //    printf("%d ", i);
-  }
-
-  //  printf("\n");
-
-  p2->ctx = starpu_create_sched_ctx("prio", procs2, nprocs2, "cholesky2");
-  p1->the_other_ctx = (int)p2->ctx;
-  p2->procs = procs2;
-  p2->nprocs = nprocs2;
-
-  pthread_t tid[2];
-  pthread_barrier_init(&barrier, NULL, 2);
-  pthread_mutex_init(&mut, NULL);
-
-  struct timeval start;
-  struct timeval end;
-
-  gettimeofday(&start, NULL);
-
-
-  pthread_create(&tid[0], NULL, (void*)func_cholesky, (void*)p1);
-  pthread_create(&tid[1], NULL, (void*)func_cholesky, (void*)p2);
-
-  void *gflops_cholesky1;
-  void *gflops_cholesky2;
- 
-  pthread_join(tid[0], &gflops_cholesky1);
-  pthread_join(tid[1], &gflops_cholesky2);
-
-  gettimeofday(&end, NULL);
-
-  pthread_mutex_destroy(&mut);
-  starpu_helper_cublas_shutdown();
-  starpu_shutdown();
-  
-  double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-  timing /= 1000000;
-  //  timing /= 60;
-
-  printf("%2.2f %2.2f ", ((retvals*)gflops_cholesky1)->flops, ((retvals*)gflops_cholesky2)->flops);
-  printf("%2.2f %2.2f %2.2f\n", ((retvals*)gflops_cholesky1)->avg_timing, ((retvals*)gflops_cholesky2)->avg_timing, timing);
-  /* printf("%2.2f %2.2f ", ((retvals*)gflops_cholesky1)->flops, 0.0 );     */
-  /*  printf("%2.2f %2.2f %2.2f\n", ((retvals*)gflops_cholesky1)->avg_timing, 0.0, timing); */
-
-}
-
-int main(int argc, char **argv)
-{
-  unsigned cpu1 = 0, cpu2 = 0;
-
-  unsigned gpu = 0, gpu1 = 0, gpu2 = 0;
-  int i;
-  
-  for (i = 9; i < argc; i++) {
-
-    if (strcmp(argv[i], "-cpu1") == 0) {
-      char *argptr;
-      cpu1 = strtol(argv[++i], &argptr, 10);
-    }    
-
-    if (strcmp(argv[i], "-cpu2") == 0) {
-      char *argptr;
-      cpu2 = strtol(argv[++i], &argptr, 10);
-    }    
-
-    if (strcmp(argv[i], "-gpu") == 0) {
-      char *argptr;
-      gpu = strtol(argv[++i], &argptr, 10);
-    }    
-
-    if (strcmp(argv[i], "-gpu1") == 0) {
-      char *argptr;
-      gpu1 = strtol(argv[++i], &argptr, 10);
-    }    
-
-    if (strcmp(argv[i], "-gpu2") == 0) {
-      char *argptr;
-      gpu2 = strtol(argv[++i], &argptr, 10);
-    }    
-
-
-  }
-
-  params p1;
-  p1.start = 1;
-  p1.argc = 5;
-  p1.argv = argv;
-
-  params p2;
-  p2.start = 5;
-  p2.argc = 9;
-  p2.argv = argv;
-
-  params p3;
-  p3.argc = argc;
-  p3.argv = argv;
-  p3.ctx = 0;
-  cholesky_vs_cholesky(&p1, &p2,&p3, cpu1, cpu2, gpu, gpu1, gpu2);
-
-  return 0;
-}

+ 5 - 5
examples/common/blas_model.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -27,15 +27,15 @@
  *	Number of flops of Gemm 
  */
 
-double gemm_cost(starpu_buffer_descr *descr)
+double gemm_cost(struct starpu_task *task, unsigned nimpl)
 {
 	/* C = A * B */
 	uint32_t nxC, nyC, nxA;
 
 
-	nxC = starpu_matrix_get_nx(descr[2].handle);
-	nyC = starpu_matrix_get_ny(descr[2].handle);
-	nxA = starpu_matrix_get_nx(descr[0].handle);
+	nxC = starpu_matrix_get_nx(task->descr[2].handle);
+	nyC = starpu_matrix_get_ny(task->descr[2].handle);
+	nxA = starpu_matrix_get_nx(task->descr[0].handle);
 
 /*	printf("nxC %d nxC %d nxA %d\n", nxC, nyC, nxA); */
 

+ 13 - 9
examples/common/blas_model.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,9 +20,10 @@
 
 #include <starpu.h>
 
-double gemm_cost(starpu_buffer_descr *descr);
+double gemm_cost(struct starpu_task *task, unsigned nimpl);
 
-static struct starpu_perfmodel_t starpu_sgemm_model = {
+static struct starpu_perfmodel starpu_sgemm_model =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = "sgemm_atlas"
@@ -33,12 +34,14 @@ static struct starpu_perfmodel_t starpu_sgemm_model = {
 #endif
 };
 
-static struct starpu_perfmodel_t starpu_sgemm_model_common = {
-	.cost_model = gemm_cost,
+static struct starpu_perfmodel starpu_sgemm_model_common =
+{
+	.cost_function = gemm_cost,
 	.type = STARPU_COMMON,
 };
 
-static struct starpu_perfmodel_t starpu_dgemm_model = {
+static struct starpu_perfmodel starpu_dgemm_model =
+{
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = "dgemm_atlas"
@@ -49,8 +52,9 @@ static struct starpu_perfmodel_t starpu_dgemm_model = {
 #endif
 };
 
-static struct starpu_perfmodel_t starpu_dgemm_model_common = {
-	.cost_model = gemm_cost,
+static struct starpu_perfmodel starpu_dgemm_model_common =
+{
+	.cost_function = gemm_cost,
 	.type = STARPU_COMMON,
 };
 

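The gemm_cost change above follows the StarPU 1.0 performance-model API: a STARPU_COMMON model now exposes a cost_function that receives the task and the implementation index, instead of a cost_model that received raw buffer descriptors. A minimal sketch of how such a model is typically attached to a codelet, assuming a hypothetical sgemm_cpu_func kernel that is not part of this commit:

static struct starpu_codelet sgemm_cl =
{
	.where = STARPU_CPU,
	.cpu_funcs = { sgemm_cpu_func, NULL },
	.nbuffers = 3,
	.modes = { STARPU_R, STARPU_R, STARPU_RW },
	/* StarPU calls starpu_sgemm_model_common.cost_function, i.e.
	 * gemm_cost(task, nimpl), to estimate the duration of each
	 * task submitted with this codelet. */
	.model = &starpu_sgemm_model_common,
};
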
+ 101 - 0
examples/cpp/incrementer_cpp.cpp

@@ -0,0 +1,101 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+#warning todo: fix cuda and opencl
+//#ifdef STARPU_USE_CUDA
+//extern void cuda_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+//#endif
+
+//#ifdef STARPU_USE_OPENCL
+//#include <starpu_opencl.h>
+//extern void opencl_codelet(void *descr[], __attribute__ ((unused)) void *_args);
+//struct starpu_opencl_program opencl_program;
+//#endif
+
+void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+
+	val[0] += 1.0f; val[1] += 1.0f;
+}
+
+int main(int argc, char **argv)
+{
+	int ret = 0;
+	starpu_data_handle_t float_array_handle;
+	float float_array[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 0.0f};
+        struct starpu_codelet cl;
+	unsigned i;
+	unsigned niter = 50;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_vector_data_register(&float_array_handle, 0, (uintptr_t)&float_array, 4, sizeof(float));
+
+//#ifdef STARPU_USE_OPENCL
+//        ret = starpu_opencl_load_opencl_from_file("examples/incrementer/incrementer_kernels_opencl_kernel.cl", &opencl_program, NULL);
+//	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
+//#endif
+
+        starpu_codelet_init(&cl);
+        cl.where = STARPU_CPU;//|STARPU_CUDA;//|STARPU_OPENCL,
+        cl.cpu_funcs[0] = cpu_codelet;
+//#ifdef STARPU_USE_CUDA
+//        cl.cuda_funcs[0] = cuda_codelet;
+//#endif
+//#ifdef STARPU_USE_OPENCL
+//	cl.opencl_funcs[0] = opencl_codelet;
+//#endif
+        cl.nbuffers = 1;
+        cl.modes[0] = STARPU_RW;
+
+	for (i = 0; i < niter; i++)
+	{
+		ret = starpu_insert_task(&cl,
+					 STARPU_RW, float_array_handle,
+					 0);
+                if (STARPU_UNLIKELY(ret == -ENODEV))
+                {
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(77);
+                }
+        }
+
+	starpu_task_wait_for_all();
+
+	/* update the array in RAM */
+	starpu_data_unregister(float_array_handle);
+
+	FPRINTF(stderr, "array -> %f, %f, %f, %f\n", float_array[0],
+                float_array[1], float_array[2], float_array[3]);
+
+	if (float_array[0] != niter || float_array[0] != float_array[1] + float_array[2] + float_array[3])
+	{
+		FPRINTF(stderr, "Incorrect result\n");
+		ret = 1;
+	}
+
+	starpu_shutdown();
+
+	return ret;
+}

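The new C++ incrementer above fills in its codelet with starpu_codelet_init() followed by field assignments rather than the designated initializers used in the C examples, presumably because C99 designated initializers are not valid C++. For comparison, a sketch of the equivalent C-style initialization (illustrative only, not part of this commit):

static struct starpu_codelet incrementer_cl =
{
	.where = STARPU_CPU,           /* the CUDA/OpenCL variants are still disabled above */
	.cpu_funcs = { cpu_codelet, NULL },
	.nbuffers = 1,
	.modes = { STARPU_RW },
};
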
+ 51 - 0
examples/filters/custom_mf/conversion.cu

@@ -0,0 +1,51 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "custom_types.h"
+#include "custom_interface.h"
+
+static __global__ void custom_cuda(struct point *aop,
+				unsigned n,
+				float *x,
+				float *y)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i < n)
+	{
+		x[i] = aop[i].x;
+		y[i] = aop[i].y;
+	}
+}
+
+extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
+{
+	(void) _args;
+
+	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	float *x = (float*) CUSTOM_GET_X_PTR(buffers[0]);
+	float *y = (float*) CUSTOM_GET_Y_PTR(buffers[0]);
+
+	struct point *aop;
+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+        custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 102 - 0
examples/filters/custom_mf/conversion_opencl.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "custom_types.h"
+#include "custom_interface.h"
+
+extern struct starpu_opencl_program opencl_conversion_program;
+
+void cpu_to_opencl_opencl_func(void *buffers[], void *args)
+{
+	(void) args;
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = CUSTOM_GET_NX(buffers[0]);
+	n*=2;
+	struct point *aop;
+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_conversion_program,
+					"custom_opencl_conversion",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
+	if (starpu_opencl_set_kernel_args(&err, &kernel,
+					  sizeof(aop), &aop,
+					  sizeof(x), &x,
+					  sizeof(n), &n,
+					  0) != 3)
+	{
+		STARPU_OPENCL_REPORT_ERROR(err);
+		assert(0);
+	}
+	
+
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(
+				queue,
+				kernel,
+				1,       /* work_dim */
+				NULL,    /* global_work_offset */
+				&global, /* global_work_size */
+				&local,  /* local_work_size */
+				0,       /* num_events_in_wait_list */
+				NULL,    /* event_wait_list */
+				&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 17 - 4
examples/starpufft/starpufft.c

@@ -1,7 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,5 +14,19 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "double.h"
-#include "starpufftx.c"
+#include "custom_types.h"
+
+/*
+ * The first nx/2 values of x are actual xs. The last nx/2 values are ys.
+ */
+__kernel void custom_opencl_conversion(__global struct point *aop,
+				       __global float *x,
+				       int nx)
+{
+        const int i = get_global_id(0);
+	if (i < nx/2)
+		x[i] = aop[i].x;
+	else if (i < nx)
+		x[i] = aop[i-nx/2].y;
+
+}

+ 45 - 0
examples/filters/custom_mf/cuda.cu

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include "custom_types.h"
+#include "custom_interface.h"
+
+static __global__ void scal_cuda(unsigned n,
+				 float *x,
+				 float *y)
+{
+        unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
+
+	if (i < n)
+		x[i] *= y[i];
+}
+
+extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
+{
+	(void) _args;
+
+	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	float *x = (float*) CUSTOM_GET_X_PTR(buffers[0]);
+	float *y = (float*) CUSTOM_GET_Y_PTR(buffers[0]);
+
+	unsigned threads_per_block = 64;
+	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
+        scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
+
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}

+ 95 - 0
examples/filters/custom_mf/custom_conversion_codelets.c

@@ -0,0 +1,95 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 INRIA
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include "custom_interface.h"
+#include "custom_types.h"
+
+#ifdef STARPU_USE_CUDA
+void cuda_to_cpu(void *buffers[], void *arg)
+{
+	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	float *x = (float*) CUSTOM_GET_X_PTR(buffers[0]);
+	float *y = (float*) CUSTOM_GET_Y_PTR(buffers[0]);
+	struct point *aop;
+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
+
+	int i;
+	for (i = 0; i < n; i++)
+	{
+		aop[i].x = x[i];
+		aop[i].y = y[i];
+	}
+	return;
+}
+
+extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
+struct starpu_codelet cpu_to_cuda_cl =
+{
+	.where = STARPU_CUDA,
+	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.name = "codelet_cpu_to_cuda"
+};
+
+struct starpu_codelet cuda_to_cpu_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {cuda_to_cpu, NULL},
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.name = "codelet_cuda_to_cpu"
+};
+#endif
+
+
+#ifdef STARPU_USE_OPENCL
+void opencl_to_cpu_cpu_func(void *buffers[], void *arg)
+{
+	unsigned int n = CUSTOM_GET_NX(buffers[0]);
+	float *x = (float *) CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
+	struct point *aop;
+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
+
+	int i;
+	for (i = 0; i < n; i++)
+	{
+		aop[i].x = x[i];
+		aop[i].y = x[i+n];
+	}
+}
+
+extern void cpu_to_opencl_opencl_func(void *buffers[], void *arg);
+
+struct starpu_codelet cpu_to_opencl_cl =
+{
+	.where = STARPU_OPENCL,
+	.opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.name = "codelet_cpu_to_opencl"
+};
+
+struct starpu_codelet opencl_to_cpu_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = { opencl_to_cpu_cpu_func, NULL },
+	.modes = { STARPU_RW },
+	.nbuffers = 1,
+	.name = "codelet_opencl_to_cpu"
+};
+#endif /* !STARPU_USE_OPENCL */

+ 599 - 0
examples/filters/custom_mf/custom_interface.c

@@ -0,0 +1,599 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include <starpu_hash.h>
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif
+#include "custom_interface.h"
+#include "custom_types.h"
+
+static int copy_ram_to_ram(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node);
+#ifdef STARPU_USE_CUDA
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node);
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
+				  void *dst_interface, unsigned dst_node,
+				  cudaStream_t stream);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
+				  void *dst_interface, unsigned dst_node,
+				  cudaStream_t stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node,
+			     void *dst_interface, unsigned dst_node);
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
+				   void *dst_interface, unsigned dst_node,
+				   cudaStream_t stream);
+#endif /* !STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node);
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
+				 void *dst_interface, unsigned dst_node);
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *event);
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *event);
+#endif /* !STARPU_USE_OPENCL */
+
+static const struct starpu_data_copy_methods custom_copy_data_methods_s =
+{
+	.ram_to_ram = copy_ram_to_ram,
+	.ram_to_spu = NULL,
+#ifdef STARPU_USE_CUDA
+	.ram_to_cuda        = copy_ram_to_cuda,
+	.cuda_to_ram        = copy_cuda_to_ram,
+	.ram_to_cuda_async  = copy_ram_to_cuda_async,
+	.cuda_to_ram_async  = copy_cuda_to_ram_async,
+	.cuda_to_cuda       = copy_cuda_to_cuda,
+	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.ram_to_opencl       = copy_ram_to_opencl,
+	.opencl_to_ram       = copy_opencl_to_ram,
+	.opencl_to_opencl    = copy_opencl_to_opencl,
+        .ram_to_opencl_async = copy_ram_to_opencl_async,
+	.opencl_to_ram_async = copy_opencl_to_ram_async,
+#endif
+	.cuda_to_spu = NULL,
+	.spu_to_ram  = NULL,
+	.spu_to_cuda = NULL,
+	.spu_to_spu  = NULL
+};
+
+static void     register_custom_handle(starpu_data_handle_t handle,
+				       uint32_t home_node,
+				       void *data_interface);
+static ssize_t  allocate_custom_buffer_on_node(void *data_interface_,
+					       uint32_t dst_node);
+static void*    custom_handle_to_pointer(starpu_data_handle_t data_handle,
+					 uint32_t node);
+static void     free_custom_buffer_on_node(void *data_interface, uint32_t node);
+static size_t   custom_interface_get_size(starpu_data_handle_t handle);
+static uint32_t footprint_custom_interface_crc32(starpu_data_handle_t handle);
+static int      custom_compare(void *data_interface_a, void *data_interface_b);
+static void     display_custom_interface(starpu_data_handle_t handle, FILE *f);
+static uint32_t custom_get_nx(starpu_data_handle_t handle);
+
+
+static struct starpu_multiformat_data_interface_ops*get_mf_ops(void *data_interface)
+{
+	struct custom_data_interface *custom;
+	custom = (struct custom_data_interface *) data_interface;
+
+	return custom->ops;
+}
+
+static struct starpu_data_interface_ops interface_custom_ops =
+{
+	.register_data_handle  = register_custom_handle,
+	.allocate_data_on_node = allocate_custom_buffer_on_node,
+	.handle_to_pointer     = custom_handle_to_pointer,
+	.free_data_on_node     = free_custom_buffer_on_node,
+	.copy_methods          = &custom_copy_data_methods_s,
+	.get_size              = custom_interface_get_size,
+	.footprint             = footprint_custom_interface_crc32,
+	.compare               = custom_compare,
+#ifdef STARPU_USE_GORDON
+	.convert_to_gordon     = NULL,
+#endif
+	.interfaceid           = -1,
+	.interface_size        = sizeof(struct custom_data_interface),
+	.display               = display_custom_interface,
+	.is_multiformat        = 1,
+	.get_mf_ops            = get_mf_ops
+};
+
+static void
+register_custom_handle(starpu_data_handle_t handle, uint32_t home_node, void *data_interface)
+{
+	struct custom_data_interface *custom_interface;
+	custom_interface = (struct custom_data_interface *) data_interface;
+
+	unsigned node;
+	unsigned nnodes = starpu_memory_nodes_get_count();
+	for (node = 0; node < nnodes; node++)
+	{
+		struct custom_data_interface *local_interface =
+			(struct custom_data_interface *) starpu_data_get_interface_on_node(handle, node);
+
+		if (node == home_node)
+		{
+			local_interface->cpu_ptr    = custom_interface->cpu_ptr;
+#ifdef STARPU_USE_CUDA
+			local_interface->cuda_ptr   = custom_interface->cuda_ptr;
+#endif
+#ifdef STARPU_USE_OPENCL
+			local_interface->opencl_ptr = custom_interface->opencl_ptr;
+#endif
+		}
+		else
+		{
+			local_interface->cpu_ptr    = NULL;
+#ifdef STARPU_USE_CUDA
+			local_interface->cuda_ptr   = NULL;
+#endif
+#ifdef STARPU_USE_OPENCL
+			local_interface->opencl_ptr = NULL;
+#endif
+		}
+		local_interface->nx = custom_interface->nx;
+		local_interface->ops = custom_interface->ops;
+	}
+}
+
+static ssize_t allocate_custom_buffer_on_node(void *data_interface, uint32_t node)
+{
+	ssize_t size = 0;
+	struct custom_data_interface *custom_interface;
+	custom_interface = (struct custom_data_interface *) data_interface;
+
+	switch(starpu_node_get_kind(node))
+	{
+	case STARPU_CPU_RAM:
+		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
+		custom_interface->cpu_ptr = (void*) malloc(size);
+		if (!custom_interface->cpu_ptr)
+			return -ENOMEM;
+#ifdef STARPU_USE_CUDA
+		custom_interface->cuda_ptr = (void *) malloc(size);
+		if (!custom_interface->cuda_ptr)
+		{
+			free(custom_interface->cpu_ptr);
+			custom_interface->cpu_ptr = NULL;
+			return -ENOMEM;
+		}
+#endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+		custom_interface->opencl_ptr = malloc(size);
+		if (custom_interface->opencl_ptr == NULL)
+		{
+			free(custom_interface->cpu_ptr);
+#ifdef STARPU_USE_CUDA
+			free(custom_interface->cuda_ptr);
+#endif /* !STARPU_USE_CUDA */
+			return -ENOMEM;
+		}
+#endif /* !STARPU_USE_OPENCL */
+			
+		break;
+#if STARPU_USE_CUDA
+	case STARPU_CUDA_RAM:
+	{
+		cudaError_t err;
+		size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
+		err = cudaMalloc(&custom_interface->cuda_ptr, size);
+		if (err != cudaSuccess)
+			return -ENOMEM;
+
+		err = cudaMalloc(&custom_interface->cpu_ptr, size);
+		if (err != cudaSuccess)
+		{
+			cudaFree(custom_interface->cuda_ptr);
+			return -ENOMEM;
+		}
+		break;
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+	case STARPU_OPENCL_RAM:
+	{
+		cl_int err;
+		cl_mem memory;
+		ssize_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
+		err = starpu_opencl_allocate_memory(&memory, size, CL_MEM_READ_WRITE);
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+		custom_interface->opencl_ptr = memory;
+
+		break;
+	}
+#endif /* !STARPU_USE_OPENCL */
+	default:
+		assert(0);
+	}
+
+	/* XXX We may want to return cpu_size + cuda_size + ... */
+	return size;
+}
+
+static void free_custom_buffer_on_node(void *data_interface, uint32_t node)
+{
+	struct custom_data_interface *custom_interface;
+	custom_interface = (struct custom_data_interface *) data_interface;
+
+	switch(starpu_node_get_kind(node))
+	{
+	case STARPU_CPU_RAM:
+		if (custom_interface->cpu_ptr != NULL)
+		{
+			free(custom_interface->cpu_ptr);
+			custom_interface->cpu_ptr = NULL;
+		}
+#ifdef STARPU_USE_CUDA
+		if (custom_interface->cuda_ptr != NULL)
+		{
+			free(custom_interface->cuda_ptr);
+			custom_interface->cuda_ptr = NULL;
+		}
+#endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+		if (custom_interface->opencl_ptr != NULL)
+		{
+			free(custom_interface->opencl_ptr);
+			custom_interface->opencl_ptr = NULL;
+		}
+#endif /* !STARPU_USE_OPENCL */
+		break;
+#ifdef STARPU_USE_CUDA
+	case STARPU_CUDA_RAM:
+		if (custom_interface->cpu_ptr != NULL)
+		{
+			cudaError_t err;
+			err = cudaFree(custom_interface->cpu_ptr);
+			if (err != cudaSuccess)
+				fprintf(stderr, "cudaFree failed...\n");
+		}
+		if (custom_interface->cuda_ptr != NULL)
+		{
+			cudaError_t err;
+			err = cudaFree(custom_interface->cuda_ptr);
+			if (err != cudaSuccess)
+				fprintf(stderr, "cudaFree failed...\n");
+		}
+		break;
+#endif /* !STARPU_USE_CUDA */
+	default:
+		assert(0);
+	}
+}
+
+static void*
+custom_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
+{
+	struct custom_data_interface *data_interface =
+		(struct custom_data_interface *) starpu_data_get_interface_on_node(handle, node);
+
+
+	switch(starpu_node_get_kind(node))
+	{
+		case STARPU_CPU_RAM:
+			return data_interface->cpu_ptr;
+#ifdef STARPU_USE_CUDA
+		case STARPU_CUDA_RAM:
+			return data_interface->cuda_ptr;
+#endif
+#ifdef STARPU_USE_OPENCL
+		case STARPU_OPENCL_RAM:
+			return data_interface->opencl_ptr;
+#endif
+		default:
+			assert(0);
+	}
+}
+
+static size_t custom_interface_get_size(starpu_data_handle_t handle)
+{
+	size_t size;
+	struct custom_data_interface *data_interface;
+
+	data_interface = (struct custom_data_interface *)
+				starpu_data_get_interface_on_node(handle, 0);
+	size = data_interface->nx * data_interface->ops->cpu_elemsize;
+	return size;
+}
+
+static uint32_t footprint_custom_interface_crc32(starpu_data_handle_t handle)
+{
+	return starpu_crc32_be(custom_get_nx(handle), 0);
+}
+
+static int custom_compare(void *data_interface_a, void *data_interface_b)
+{
+	/* TODO */
+	assert(0);
+}
+
+static void display_custom_interface(starpu_data_handle_t handle, FILE *f)
+{
+	/* TODO */
+	assert(0);
+}
+
+static uint32_t
+custom_get_nx(starpu_data_handle_t handle)
+{
+	struct custom_data_interface *data_interface;
+	data_interface = (struct custom_data_interface *)
+				starpu_data_get_interface_on_node(handle, 0);
+	return data_interface->nx;
+}
+
+
+void custom_data_register(starpu_data_handle_t *handle,
+				 uint32_t home_node,
+				 void *ptr,
+				 uint32_t nx,
+				 struct starpu_multiformat_data_interface_ops *format_ops)
+{
+	/* XXX Deprecated fields ? */
+	struct custom_data_interface custom =
+	{
+		.cpu_ptr = ptr,
+#ifdef STARPU_USE_CUDA
+		.cuda_ptr = NULL,
+#endif
+#ifdef STARPU_USE_OPENCL
+		.opencl_ptr = NULL,
+#endif
+		.nx  = nx,
+		.ops = format_ops
+	};
+
+	if (interface_custom_ops.interfaceid == -1) {
+		interface_custom_ops.interfaceid = starpu_data_interface_get_next_id();
+	}
+	starpu_data_register(handle, home_node, &custom, &interface_custom_ops);
+}
+
+static int copy_ram_to_ram(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node)
+{
+	/* TODO */
+	assert(0);
+}
+#ifdef STARPU_USE_CUDA
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node)
+{
+	/* TODO */
+	assert(0);
+}
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node,
+			    void *dst_interface, unsigned dst_node)
+{
+	/* TODO */
+	assert(0);
+}
+
+static int
+copy_cuda_common_async(void *src_interface, unsigned src_node,
+		       void *dst_interface, unsigned dst_node,
+		       cudaStream_t stream, enum cudaMemcpyKind kind)
+{
+	struct custom_data_interface *src_custom, *dst_custom;
+
+	src_custom = (struct custom_data_interface *) src_interface;
+	dst_custom = (struct custom_data_interface *) dst_interface;
+
+	ssize_t size = 0;
+	cudaError_t err;
+
+	switch (kind)
+	{
+	case cudaMemcpyHostToDevice:
+	{
+		size = src_custom->nx * src_custom->ops->cpu_elemsize;
+		if (dst_custom->cpu_ptr == NULL)
+		{
+			err = cudaMalloc(&dst_custom->cpu_ptr, size);
+			assert(err == cudaSuccess);
+		}
+
+		err = cudaMemcpyAsync(dst_custom->cpu_ptr,
+				      src_custom->cpu_ptr,
+				      size, kind, stream);
+		assert(err == cudaSuccess);
+
+
+		err = cudaMalloc(&dst_custom->cuda_ptr, size);
+		assert(err == cudaSuccess);
+		break;
+	}
+	case cudaMemcpyDeviceToHost:
+		size = 2*src_custom->nx*sizeof(float);
+		if (dst_custom->cuda_ptr == NULL)
+		{
+			dst_custom->cuda_ptr = malloc(size);
+			if (dst_custom->cuda_ptr == NULL)
+				return -ENOMEM;
+		}
+		err = cudaMemcpyAsync(dst_custom->cuda_ptr,
+				      src_custom->cuda_ptr,
+				      size, kind, stream);
+		assert(err == cudaSuccess);
+		break;
+	default:
+		assert(0);
+	}
+
+	return 0;
+}
+
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
+				  void *dst_interface, unsigned dst_node,
+				  cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node,
+				      dst_interface, dst_node,
+				      stream, cudaMemcpyHostToDevice);
+}
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
+				  void *dst_interface, unsigned dst_node,
+				  cudaStream_t stream)
+{
+	return copy_cuda_common_async(src_interface, src_node,
+				      dst_interface, dst_node,
+				      stream, cudaMemcpyDeviceToHost);
+}
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node,
+			     void *dst_interface, unsigned dst_node)
+{
+	assert(0);
+}
+static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
+				   void *dst_interface, unsigned dst_node,
+				   cudaStream_t stream)
+{
+	assert(0);
+}
+#endif /* !STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+static int copy_ram_to_opencl(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+	return 0;
+}
+
+static int copy_opencl_to_ram(void *src_interface, unsigned src_node,
+			      void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+	return 0;
+}
+
+static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
+				 void *dst_interface, unsigned dst_node)
+{
+	(void) src_interface;
+	(void) src_node;
+	(void) dst_interface;
+	(void) dst_node;
+	return 0;
+}
+
+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *event)
+{
+	ssize_t size;
+	struct custom_data_interface *src_custom, *dst_custom;
+
+	src_custom = (struct custom_data_interface *) src_interface;
+	dst_custom = (struct custom_data_interface *) dst_interface;
+
+	/*
+	 * Opencl stuff.
+	 */
+	cl_context context;
+	cl_command_queue queue;
+	int id = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_queue(devid, &queue);
+	starpu_opencl_get_context(devid, &context);
+
+	/* Real stuff */
+	int err;
+	cl_int ret;
+
+	size = src_custom->nx * 2 * sizeof(float);
+	if (dst_custom->cpu_ptr == NULL)
+	{
+		ret = starpu_opencl_allocate_memory((cl_mem*)&dst_custom->cpu_ptr,
+				size, CL_MEM_READ_WRITE);
+		assert(ret == CL_SUCCESS);
+	}
+	err = starpu_opencl_copy_ram_to_opencl_async_sync(src_custom->cpu_ptr,
+							  src_node,
+							  dst_custom->cpu_ptr,
+							  dst_node,
+							  size,
+							  0,
+							  NULL,
+							  &ret);
+	assert(err == 0);
+	return 0;
+}
+
+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    void *event)
+{
+	ssize_t size;
+	struct custom_data_interface *src_custom, *dst_custom;
+
+	src_custom = (struct custom_data_interface *) src_interface;
+	dst_custom = (struct custom_data_interface *) dst_interface;
+
+	/*
+	 * Opencl stuff.
+	 */
+	cl_context context;
+	cl_command_queue queue;
+	int id = starpu_worker_get_id();
+	int devid = starpu_worker_get_devid(id);
+	starpu_opencl_get_queue(devid, &queue);
+	starpu_opencl_get_context(devid, &context);
+
+	/* real stuff */
+	int err;
+	cl_int ret;
+	size = src_custom->nx * 2 * sizeof(float);
+	if (!dst_custom->opencl_ptr)
+	{
+		dst_custom->opencl_ptr = malloc(size);
+		assert(dst_custom->opencl_ptr != NULL);
+	}
+
+	err = starpu_opencl_copy_opencl_to_ram_async_sync(
+			src_custom->opencl_ptr,
+			src_node,
+			dst_custom->opencl_ptr,
+			dst_node,
+			size,
+			0,
+			NULL,
+			&ret);
+	assert(err == 0);
+	return 0;
+}
+#endif /* !STARPU_USE_OPENCL */

+ 48 - 0
examples/filters/custom_mf/custom_interface.h

@@ -0,0 +1,48 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#ifndef __CUSTOM_INTERFACE_H__
+#define __CUSTOM_INTERFACE_H__
+#include <starpu.h>
+struct custom_data_interface
+{
+	void *cpu_ptr;
+	void *cuda_ptr;
+	void *opencl_ptr;
+	struct starpu_multiformat_data_interface_ops *ops;
+	uint32_t nx;
+};
+
+void custom_data_register(starpu_data_handle_t *handle,
+				 uint32_t home_node,
+				 void *ptr,
+				 uint32_t nx,
+				 struct starpu_multiformat_data_interface_ops* ops);
+
+#define CUSTOM_GET_NX(interface) (((struct custom_data_interface*)(interface))->nx)
+#define CUSTOM_GET_CPU_PTR(interface) (((struct custom_data_interface*)(interface))->cpu_ptr)
+
+#ifdef STARPU_USE_CUDA
+#define CUSTOM_GET_X_PTR(interface) (((struct custom_data_interface*)(interface))->cuda_ptr)
+#define CUSTOM_GET_Y_PTR(interface) \
+	(((struct custom_data_interface*)(interface))->cuda_ptr)+ \
+	CUSTOM_GET_NX((interface))
+#endif /* !STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+#define CUSTOM_GET_OPENCL_X_PTR(interface) (((struct custom_data_interface *)(interface))->opencl_ptr)
+#endif
+
+#endif /* ! __CUSTOM_INTERFACE_H__ */

+ 331 - 0
examples/filters/custom_mf/custom_mf_filter.c

@@ -0,0 +1,331 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <starpu.h>
+#include "custom_interface.h"
+#include "custom_types.h"
+#ifdef STARPU_USE_OPENCL
+#include <starpu_opencl.h>
+#endif /* !STARPU_USE_OPENCL */
+
+#define N 12
+
+#define DEBUG 1
+
+#ifdef STARPU_USE_CUDA
+static unsigned int ncuda;
+#endif
+#ifdef STARPU_USE_OPENCL
+static unsigned int nopencl;
+#endif
+
+
+static struct point array_of_structs[N];
+static starpu_data_handle_t handle;
+static unsigned int nchunks = 6;
+
+#ifdef STARPU_USE_CUDA
+extern struct starpu_codelet cpu_to_cuda_cl;
+extern struct starpu_codelet cuda_to_cpu_cl;
+#endif
+
+#ifdef STARPU_USE_OPENCL
+extern struct starpu_codelet cpu_to_opencl_cl;
+extern struct starpu_codelet opencl_to_cpu_cl;
+#endif
+
+static struct starpu_multiformat_data_interface_ops format_ops =
+{
+#ifdef STARPU_USE_CUDA
+	.cuda_elemsize = sizeof(struct struct_of_arrays),
+	.cpu_to_cuda_cl = &cpu_to_cuda_cl,
+	.cuda_to_cpu_cl = &cuda_to_cpu_cl,
+#endif
+#ifdef STARPU_USE_OPENCL
+	.opencl_elemsize  = sizeof(struct struct_of_arrays),
+	.cpu_to_opencl_cl = &cpu_to_opencl_cl,
+	.opencl_to_cpu_cl = &opencl_to_cpu_cl,
+#endif
+	.cpu_elemsize = sizeof(struct point),
+};
+
+
+static void
+custom_filter(void *father, void *child, struct starpu_data_filter *f,
+		unsigned id, unsigned nchunks)
+{
+	struct custom_data_interface *custom_father, *custom_child;
+	custom_father = (struct custom_data_interface *) father;
+	custom_child = (struct custom_data_interface *) child;
+
+	assert(N % nchunks == 0); // XXX 
+	ssize_t chunk_size = N/nchunks;
+
+	if (custom_father->cpu_ptr)
+	{
+		struct point *tmp = (struct point *) custom_father->cpu_ptr;
+		tmp += id * chunk_size;
+		custom_child->cpu_ptr = tmp;
+	}
+#ifdef STARPU_USE_CUDA
+	else if (custom_father->cuda_ptr)
+	{
+		struct struct_of_arrays *soa_father, *soa_child;
+		soa_father = (struct struct_of_arrays*) custom_father->cuda_ptr;
+		soa_child = (struct struct_of_arrays*) custom_child->cuda_ptr;
+		soa_child->x = soa_father->x + chunk_size;
+		soa_child->y = soa_father->y + chunk_size;
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+	else if (custom_father->opencl_ptr)
+	{
+		struct struct_of_arrays *soa_father, *soa_child;
+		soa_father = (struct struct_of_arrays*) custom_father->opencl_ptr;
+		soa_child = (struct struct_of_arrays*) custom_child->opencl_ptr;
+		soa_child->x = soa_father->x + chunk_size;
+		soa_child->y = soa_father->y + chunk_size;
+	}
+#endif /* !STARPU_USE_OPENCL */
+
+	custom_child->ops = custom_father->ops;
+	custom_child->nx = chunk_size;
+}
+
+static void
+register_and_partition_data(void)
+{
+	int i;
+	for (i = 0; i < N; i++)
+	{
+		array_of_structs[i].x = i+1.0;
+		array_of_structs[i].y = 42.0;
+	}
+	custom_data_register(&handle, 0, &array_of_structs, N, &format_ops);
+
+	struct starpu_data_filter f =
+	{
+		.filter_func   = custom_filter,
+		.nchildren     = nchunks,
+		.get_nchildren = NULL,
+		.get_child_ops = NULL
+	};
+	starpu_data_partition(handle, &f);
+}
+
+static void
+unpartition_and_unregister_data(void)
+{
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unregister(handle);
+}
+
+static void
+custom_scal_cpu_func(void *buffers[], void *args)
+{
+	struct point *aos;
+	unsigned int n, i;
+
+	aos = CUSTOM_GET_CPU_PTR(buffers[0]);
+	n = CUSTOM_GET_NX(buffers[0]);
+
+	for (i = 0; i < n; i++)
+		aos[i].x *= aos[i].y;
+}
+
+#ifdef STARPU_USE_CUDA
+extern void custom_scal_cuda_func(void *buffers[], void *args);
+#endif
+
+static struct starpu_codelet cpu_cl =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = { custom_scal_cpu_func, NULL},
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.name = "codelet_real"
+};
+
+#ifdef STARPU_USE_CUDA
+static struct starpu_codelet cuda_cl =
+{
+	.where = STARPU_CUDA,
+	.cuda_funcs = { custom_scal_cuda_func, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.name = "cuda_codelet"
+};
+#endif /* !STARPU_USE_CUDA */
+
+#ifdef STARPU_USE_OPENCL
+extern void custom_scal_opencl_func(void *buffers[], void *args);
+
+static struct starpu_codelet opencl_cl =
+{
+	.where = STARPU_OPENCL,
+	.opencl_funcs = { custom_scal_opencl_func, NULL },
+	.nbuffers = 1,
+	.modes = { STARPU_RW },
+	.name = "opencl_codelet"
+};
+#endif /* !STARPU_USE_OPENCL */
+
+static int
+create_and_submit_tasks(void)
+{
+	int err;
+	unsigned int i;
+	for (i = 0; i < nchunks; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		switch (i%3)
+		{
+		case 0:
+			task->cl = &cpu_cl;
+			break;
+		case 1:
+#ifdef STARPU_USE_CUDA
+			if (ncuda > 0)
+				task->cl = &cuda_cl;
+			else
+#endif
+				task->cl = &cpu_cl;
+			break;
+		case 2:
+#ifdef STARPU_USE_OPENCL
+			if (nopencl > 0)
+				task->cl = &opencl_cl;
+			else
+#endif
+				task->cl = &cpu_cl;
+			break;
+		default:
+			/* We should never get here */
+			assert(0);
+		}
+
+		task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+		err = starpu_task_submit(task);
+		if (err != 0)
+			return err;
+	}
+
+
+	err = starpu_task_wait_for_all();
+	if (err != 0)
+		return err;
+
+	return 0;
+}
+
+#if DEBUG
+static void
+print_it(void)
+{
+	int i;
+	for (i = 0; i < N; i++)
+	{
+		FPRINTF(stderr, "(%.2f, %.2f) ",
+			array_of_structs[i].x,
+			array_of_structs[i].y);
+	}
+	FPRINTF(stderr, "\n");
+}
+#endif
+
+static int
+check_it(void)
+{
+	int i;
+	for (i = 0; i < N; i++)
+	{
+		float expected_value = (i + 1.0)*42.0;
+		if (array_of_structs[i].x != expected_value)
+			return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
+
+#ifdef STARPU_USE_OPENCL
+struct starpu_opencl_program opencl_program;
+struct starpu_opencl_program opencl_conversion_program;
+#endif /* !STARPU_USE_OPENCL */
+
+int
+main(void)
+{
+#ifndef STARPU_USE_CPU
+	return 77;
+#else
+	int err;
+
+	err = starpu_init(NULL);
+	if (err == -ENODEV)
+		goto enodev;
+
+#ifdef STARPU_USE_CUDA
+	ncuda = starpu_cuda_worker_get_count();
+#endif /* !STARPU_USE_CUDA */
+#ifdef STARPU_USE_OPENCL
+	nopencl = starpu_opencl_worker_get_count();
+	if (nopencl > 0)
+	{
+		char *f1 = "examples/filters/custom_mf/custom_opencl.cl";
+		char *f2 = "examples/filters/custom_mf/conversion_opencl.cl";
+		err = starpu_opencl_load_opencl_from_file(f1, &opencl_program,
+							  NULL);
+		assert(err == 0);
+		err = starpu_opencl_load_opencl_from_file(f2,
+						&opencl_conversion_program,
+						NULL);
+		assert(err == 0);
+	}
+#endif /* !STARPU_USE_OPENCL */
+
+	register_and_partition_data();
+#if DEBUG
+	print_it();
+#endif
+	err = create_and_submit_tasks();
+	if (err != 0)
+	{
+		FPRINTF(stderr, "create_submit_task : %s\n",
+			strerror(-err));
+		return EXIT_FAILURE;
+	}
+	unpartition_and_unregister_data();
+#if DEBUG
+	print_it();
+#endif
+
+#if STARPU_USE_OPENCL
+	if (nopencl > 0)
+	{
+        	err = starpu_opencl_unload_opencl(&opencl_program);
+		assert(err == 0);
+		err = starpu_opencl_unload_opencl(&opencl_conversion_program);
+		assert(err == 0);
+	}
+#endif /* !STARPU_USE_OPENCL */
+	starpu_shutdown();		
+	print_it();
+	return check_it();
+
+
+enodev:
+	return 77;
+#endif
+}

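The custom_types.h header included by the custom_mf files is not shown in this diff. From its uses above (aop[i].x and aop[i].y on the CPU side, soa->x and soa->y pointers on the CUDA/OpenCL side, and cuda_elemsize/opencl_elemsize set to sizeof(struct struct_of_arrays)), it presumably declares something along these lines; this is a readability sketch, not the actual file:

struct point             /* array-of-structs layout used by the CPU codelets */
{
	float x;
	float y;
};

struct struct_of_arrays  /* struct-of-arrays layout used by the CUDA/OpenCL codelets */
{
	float *x;
	float *y;
};
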
+ 101 - 0
examples/filters/custom_mf/custom_opencl.c

@@ -0,0 +1,101 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_opencl.h>
+#include "custom_types.h"
+#include "custom_interface.h"
+
+extern struct starpu_opencl_program opencl_program;
+
+void custom_scal_opencl_func(void *buffers[], void *args)
+{
+	(void) args;
+	int id, devid;
+        cl_int err;
+	cl_kernel kernel;
+	cl_command_queue queue;
+	cl_event event;
+
+	unsigned n = CUSTOM_GET_NX(buffers[0]);
+	struct point *aop;
+	aop = (struct point *) CUSTOM_GET_CPU_PTR(buffers[0]);
+
+	id = starpu_worker_get_id();
+	devid = starpu_worker_get_devid(id);
+
+	err = starpu_opencl_load_kernel(&kernel,
+					&queue,
+					&opencl_program,
+					"custom_scal_opencl",
+					devid);
+	if (err != CL_SUCCESS)
+		STARPU_OPENCL_REPORT_ERROR(err);
+
+
+	void *x = CUSTOM_GET_OPENCL_X_PTR(buffers[0]);
+	if (starpu_opencl_set_kernel_args(&err, &kernel,
+					  sizeof(aop), &aop,
+					  sizeof(x), &x,
+					  sizeof(n), &n,
+					  0) != 3)
+	{
+		STARPU_OPENCL_REPORT_ERROR(err);
+		assert(0);
+	}
+	
+
+	{
+		size_t global=n;
+		size_t local;
+                size_t s;
+                cl_device_id device;
+
+                starpu_opencl_get_device(devid, &device);
+
+                err = clGetKernelWorkGroupInfo (kernel,
+						device,
+						CL_KERNEL_WORK_GROUP_SIZE,
+						sizeof(local),
+						&local,
+						&s);
+                if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+
+                if (local > global)
+			local = global;
+
+		err = clEnqueueNDRangeKernel(
+				queue,
+				kernel,
+				1,       /* work_dim */
+				NULL,    /* global_work_offset */
+				&global, /* global_work_size */
+				&local,  /* local_work_size */
+				0,       /* num_events_in_wait_list */
+				NULL,    /* event_wait_list */
+				&event);
+
+		if (err != CL_SUCCESS)
+			STARPU_OPENCL_REPORT_ERROR(err);
+	}
+
+	clFinish(queue);
+	starpu_opencl_collect_stats(event);
+	clReleaseEvent(event);
+
+	starpu_opencl_release_kernel(kernel);
+}

+ 0 - 0
examples/starpufft/starpufftf.c


Some files were not shown because too many files changed in this diff