@c -*-texinfo-*-

@c This file is part of the StarPU Handbook.
@c Copyright (C) 2009--2011 Universit@'e de Bordeaux 1
@c Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
@c See the file starpu.texi for copying conditions.

@menu
* Using multiple implementations of a codelet::
* Enabling implementation according to capabilities::
* Task and Worker Profiling::
* Partitioning Data:: Partitioning Data
* Performance model example::
* Theoretical lower bound on execution time::
* Insert Task Utility::
* Parallel Tasks::
* Debugging::
* The multiformat interface::
* On-GPU rendering::
* More examples:: More examples shipped with StarPU
@end menu

@node Using multiple implementations of a codelet
@section Using multiple implementations of a codelet

One may want to write multiple implementations of a codelet for a single type of
device and let StarPU choose which one to run. As an example, we will show how
to use SSE to scale a vector. The codelet can be written as follows:

@cartouche
@smallexample
#include <xmmintrin.h>

void scal_sse_func(void *buffers[], void *cl_arg)
@{
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    unsigned int n_iterations = n/4;
    if (n % 4 != 0)
        n_iterations++;

    __m128 *VECTOR = (__m128*) vector;
    __m128 factor __attribute__((aligned(16)));
    factor = _mm_set1_ps(*(float *) cl_arg);

    unsigned int i;
    for (i = 0; i < n_iterations; i++)
        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
@}
@end smallexample
@end cartouche

@cartouche
@smallexample
struct starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};
@end smallexample
@end cartouche

Schedulers which are multi-implementation aware (only @code{dmda}, @code{heft}
and @code{pheft} for now) will use the performance models of all the
implementations they were given, and pick the one that seems to be the fastest.
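
Submitting a task that uses such a codelet is no different from the
single-implementation case, since StarPU selects the implementation by itself.
A minimal sketch, assuming a registered @code{vector_handle} and a
@code{factor} variable defined by the application:

@cartouche
@smallexample
/* Sketch: vector_handle and factor are assumed to be provided
 * by the application; StarPU picks the implementation to run. */
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->handles[0] = vector_handle;
task->cl_arg = &factor;
task->cl_arg_size = sizeof(factor);
starpu_task_submit(task);
@end smallexample
@end cartouche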

@node Enabling implementation according to capabilities
@section Enabling implementation according to capabilities

Some implementations may not run on some devices. For instance, some CUDA
devices do not support double floating point precision, and thus the kernel
execution would just fail; or the device may not have enough shared memory for
the implementation being used. The @code{can_execute} field of the @code{struct
starpu_codelet} structure makes it possible to express this. For instance:

@cartouche
@smallexample
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* Cuda device */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2 || props->minor >= 3)
        /* At least compute capability 1.3, supports doubles */
        return 1;
    /* Old card, does not support doubles */
    return 0;
@}

struct starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = @{ cpu_func, NULL @},
    .cuda_funcs = @{ gpu_func, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};
@end smallexample
@end cartouche

This can be essential e.g. when running on a machine which mixes various models
of CUDA devices, to benefit from the newer models without crashing on the older ones.

Note: the @code{can_execute} function is called by the scheduler each time it
tries to match a task with a worker, and should thus be very fast. The
@code{starpu_cuda_get_device_properties} function provides quick access to the
CUDA properties of CUDA devices to achieve such efficiency.

Another example is compiling CUDA code for various compute capabilities,
resulting in two CUDA functions, e.g. @code{scal_gpu_13} for compute capability
1.3, and @code{scal_gpu_20} for compute capability 2.0. Both functions can be
provided to StarPU by using @code{cuda_funcs}, and @code{can_execute} can then be
used to rule out the @code{scal_gpu_20} variant on a CUDA device which
will not be able to execute it:

@cartouche
@smallexample
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* Cuda device */
    if (nimpl == 0)
        /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases. */
        return 1;
    /* Trying to execute the 2.0 capability variant, check that the card can do it. */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2)
        /* At least compute capability 2.0, can run it */
        return 1;
    /* Old card, does not support 2.0, will not be able to execute the 2.0 variant. */
    return 0;
@}

struct starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = @{ cpu_func, NULL @},
    .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};
@end smallexample
@end cartouche

Note: the most generic variant should be provided first, as some schedulers are
not able to try the different variants.

@node Task and Worker Profiling
@section Task and Worker Profiling

A full example showing how to use the profiling API is available in
the StarPU sources in the directory @code{examples/profiling/}.
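
Profiling information is only collected while profiling is enabled, so it has
to be turned on before the tasks of interest are submitted. A minimal sketch:

@cartouche
@smallexample
/* Enable profiling before submitting the tasks we want to inspect. */
starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
@end smallexample
@end cartouche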

@cartouche
@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->synchronous = 1;
/* We will destroy the task structure by hand so that we can
 * query the profiling info before the task is destroyed. */
task->destroy = 0;

/* Submit and wait for completion (since synchronous was set to 1) */
starpu_task_submit(task);

/* The task is finished, get profiling information */
struct starpu_task_profiling_info *info = task->profiling_info;

/* How much time did it take before the task started? */
double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);

/* How long did the task execution take? */
double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);

/* We don't need the task structure anymore */
starpu_task_destroy(task);
@end smallexample
@end cartouche

@cartouche
@smallexample
/* Display the occupancy of all workers during the test */
int worker;
for (worker = 0; worker < starpu_worker_get_count(); worker++)
@{
    struct starpu_worker_profiling_info worker_info;
    int ret = starpu_worker_get_profiling_info(worker, &worker_info);
    STARPU_ASSERT(!ret);

    double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
    double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
    double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);

    float executing_ratio = 100.0*executing_time/total_time;
    float sleeping_ratio = 100.0*sleeping_time/total_time;

    char workername[128];
    starpu_worker_get_name(worker, workername, 128);
    fprintf(stderr, "Worker %s:\n", workername);
    fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
    fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3,
            executing_ratio);
    fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
            sleeping_ratio);
@}
@end smallexample
@end cartouche

@node Partitioning Data
@section Partitioning Data

An existing piece of data can be partitioned into sub-parts to be used by different tasks, for instance:

@cartouche
@smallexample
int vector[NX];
starpu_data_handle_t handle;

/* Declare data to StarPU */
starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));

/* Partition the vector into PARTS sub-vectors */
struct starpu_data_filter f =
@{
    .filter_func = starpu_block_filter_func_vector,
    .nchildren = PARTS
@};
starpu_data_partition(handle, &f);
@end smallexample
@end cartouche

The task submission then uses @code{starpu_data_get_sub_data} to retrieve the
sub-handles to be passed as task parameters.

@cartouche
@smallexample
/* Submit a task on each sub-vector */
for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
    /* Get subdata number i (there is only 1 dimension) */
    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
    struct starpu_task *task = starpu_task_create();

    task->handles[0] = sub_handle;
    task->cl = &cl;
    task->synchronous = 1;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(factor);

    starpu_task_submit(task);
@}
@end smallexample
@end cartouche

Partitioning can be applied several times, see
@code{examples/basic_examples/mult.c} and @code{examples/filters/}.
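
Once the tasks working on the sub-data have completed, the partitioning can be
undone, gathering all the pieces back into the initial handle. A minimal
sketch, continuing the example above (node @code{0} is the main memory):

@cartouche
@smallexample
/* Gather the pieces back into the whole vector, in main memory,
 * then release the handle. */
starpu_data_unpartition(handle, 0);
starpu_data_unregister(handle);
@end smallexample
@end cartouche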

Wherever the whole piece of data is already available, the partitioning will
be done in-place, i.e. without allocating new buffers but just using pointers
inside the existing copy. This is particularly important to be aware of when
using OpenCL, where the kernel parameters are not pointers, but handles. The
kernel thus also needs to be passed the offset within the OpenCL buffer:

@cartouche
@smallexample
void opencl_func(void *buffers[], void *cl_arg)
@{
    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);

    ...
    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
    ...
@}
@end smallexample
@end cartouche

And the kernel has to shift from the pointer passed by the OpenCL driver:

@cartouche
@smallexample
__kernel void opencl_kernel(__global int *vector, unsigned offset)
@{
    vector = (__global int *) ((__global char *) vector + offset);
    ...
@}
@end smallexample
@end cartouche

@node Performance model example
@section Performance model example

To achieve good scheduling, StarPU scheduling policies need to be able to
estimate in advance the duration of a task. This is done by giving to codelets
a performance model, by defining a @code{starpu_perfmodel} structure and
providing its address in the @code{model} field of the @code{struct starpu_codelet}
structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
are mandatory, to give a name to the model, and the type of the model, since
there are several kinds of performance models.

@itemize
@item
Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
given set of data input/output sizes, the performance will always be about the
same. This holds very well for regular kernels on GPUs for instance (error below
0.1%), and fairly well on CPUs (error around 1%). This also assumes that there are
few different sets of data input/output sizes. StarPU will then keep a record of
the average time of previous executions on the various processing units, and use
it as an estimation. The history is kept per task size, by using a hash of the
input and output sizes as an index.
The measurements are also saved in @code{~/.starpu/sampling/codelets}
for further executions; they can be observed by using the
@code{starpu_perfmodel_display} command, or plotted by using
the @code{starpu_perfmodel_plot} tool. The models are indexed by machine name. To
share the models between machines (e.g. for a homogeneous cluster), use
@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done when
using a task scheduler which makes use of them, such as @code{heft} or @code{dmda}.

A small code example is given below.

If e.g. the code is recompiled with other compilation options, or several
variants of the code are used, the symbol string should be changed to reflect
that, in order to recalibrate a new model from zero. The symbol string can even
be constructed dynamically at execution time, as long as this is done before
submitting any task using it.

@cartouche
@smallexample
static struct starpu_perfmodel mult_perf_model = @{
    .type = STARPU_HISTORY_BASED,
    .symbol = "mult_perf_model"
@};

struct starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ cpu_mult, NULL @},
    .nbuffers = 3,
    .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
    /* for the scheduling policy to be able to use performance models */
    .model = &mult_perf_model
@};
@end smallexample
@end cartouche

@item
Measured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}
model type). This still assumes performance regularity, but can work
with various data input sizes, by applying regression over observed
execution times. @code{STARPU_REGRESSION_BASED} uses an @code{a*n^b} regression
form, while @code{STARPU_NL_REGRESSION_BASED} uses an @code{a*n^b+c} form (more
precise than @code{STARPU_REGRESSION_BASED}, but costs a lot more to compute).
For instance,
@code{tests/perfmodels/regression_based.c} uses a regression-based performance
model for the @code{memset} operation. Of course, the application has to issue
tasks with varying sizes so that the regression can be computed. StarPU will not
trust the regression unless there is at least a 10% difference between the minimum
and maximum observed input sizes. For non-linear regression, since computing it
is quite expensive, it is only done at termination of the application. This
means that the first execution uses a history-based performance model to perform
scheduling.
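
Declaring such a model is analogous to the history-based example above; only
the @code{type} field changes. A minimal sketch, assuming an
application-provided @code{cpu_memset} implementation:

@cartouche
@smallexample
static struct starpu_perfmodel memset_model = @{
    .type = STARPU_NL_REGRESSION_BASED,
    .symbol = "memset_model"
@};

struct starpu_codelet memset_cl = @{
    .where = STARPU_CPU,
    /* cpu_memset is an assumed application-provided implementation */
    .cpu_funcs = @{ cpu_memset, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_W @},
    .model = &memset_model
@};
@end smallexample
@end cartouche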

@item
Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_function} field),
see for instance
@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.

@item
Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
@code{.per_arch[arch][nimpl].cost_function} fields have to be filled with pointers to
functions which return the expected duration of the task in micro-seconds, one
per architecture.

@end itemize

For the @code{STARPU_HISTORY_BASED} and @code{STARPU_*REGRESSION_BASED} models,
the total size of task data (both input and output) is used as an index by
default. The @code{size_base} field of @code{struct starpu_perfmodel} however
permits the application to override that, when for instance some of the data
do not matter for the task cost (e.g. a mere reference table), or when using sparse
structures (in which case it is the number of non-zeros which matters), or when
there is some hidden parameter such as the number of iterations, etc.

How to use schedulers which can benefit from such performance models is explained
in @ref{Task scheduling policy}.

The same can be done for task power consumption estimation, by setting the
@code{power_model} field the same way as the @code{model} field. Note: for
now, the application has to give the power consumption performance model
a name which is different from that of the execution time performance model.

The application can request time estimations from the StarPU performance
models by filling a task structure as usual without actually submitting
it. The data handles can be created by calling @code{starpu_data_register}
functions with a @code{NULL} pointer (and need to be unregistered as usual)
and the desired data sizes. The @code{starpu_task_expected_length} and
@code{starpu_task_expected_power} functions can then be called to get an
estimation of the task duration on a given architecture. @code{starpu_task_destroy}
needs to be called to destroy the dummy task afterwards. See
@code{tests/perfmodels/regression_based.c} for an example.
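
A minimal sketch of such a query, reusing the @code{memset_cl} codelet above;
the sizes are arbitrary, and @code{STARPU_CPU_DEFAULT} is assumed here to
denote the CPU architecture in the performance model API:

@cartouche
@smallexample
/* Sketch: estimate the duration of a memset task on a CPU without
 * actually running it. */
starpu_data_handle_t dummy_handle;
starpu_vector_data_register(&dummy_handle, -1, (uintptr_t) NULL,
                            1024*1024, sizeof(float));

struct starpu_task *task = starpu_task_create();
task->cl = &memset_cl;
task->handles[0] = dummy_handle;
task->destroy = 0;

double expected_us = starpu_task_expected_length(task, STARPU_CPU_DEFAULT, 0);

starpu_task_destroy(task);
starpu_data_unregister(dummy_handle);
@end smallexample
@end cartouche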

@node Theoretical lower bound on execution time
@section Theoretical lower bound on execution time

For kernels with history-based performance models, StarPU can very easily provide
a theoretical lower bound for the execution time of a whole set of tasks. See for
instance @code{examples/lu/lu_example.c}: before submitting tasks,
call @code{starpu_bound_start}, and after complete execution, call
@code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
@code{starpu_bound_print_mps} can then be used to output a Linear Programming
problem corresponding to the schedule of your tasks. Run it through
@code{lp_solve} or any other linear programming solver, and that will give you a
lower bound for the total execution time of your tasks. If StarPU was compiled
with the glpk library installed, @code{starpu_bound_compute} can be used to
solve it immediately and get the optimized minimum, in ms. Its @code{integer}
parameter allows deciding whether integer resolution should be computed
and returned too.
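
A minimal sketch of this workflow (@code{integer} is described above; the
@code{deps} and @code{prio} parameters are discussed below):

@cartouche
@smallexample
starpu_bound_start(deps, prio);

/* ... submit the tasks ... */
starpu_task_wait_for_all();

starpu_bound_stop();

/* Solve immediately (requires glpk); min is in ms */
double min, min_int;
starpu_bound_compute(&min, &min_int, integer);
fprintf(stderr, "theoretical minimum: %f ms\n", min);
@end smallexample
@end cartouche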

The @code{deps} parameter tells StarPU whether to take tasks and implicit data
dependencies into account. Be aware that the linear programming problem size is
quadratic in the number of tasks, so solving it can take very long; even a few
dozen tasks can require minutes. You should
probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
problem to MPS format and then use a better solver: @code{glpsol} might be
better than @code{lp_solve} for instance (the @code{--pcost} option may be
useful), but it sometimes does not manage to converge. @code{cbc} might look
slower, but it is parallel. Be sure to try at least all the @code{-B} options
of @code{lp_solve}. For instance, we often just use
@code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi}, and
the @code{-gr} option can also be quite useful.

Setting @code{deps} to 0 will only take into account the actual computations
on processing units. It however still properly takes into account the varying
performances of kernels and processing units, which is much more accurate than
just comparing StarPU performances with the fastest of the kernels being used.

The @code{prio} parameter tells StarPU whether to simulate taking into account
the priorities as the StarPU scheduler would, i.e. schedule prioritized
tasks before less prioritized tasks, to check to what extent this results
in a less optimal solution. This increases the computation time even more.

Note that for simplicity, all of this ignores data transfers, which are
assumed to be completely overlapped.

@node Insert Task Utility
@section Insert Task Utility

StarPU provides the wrapper function @code{starpu_insert_task} to ease
the creation and submission of tasks.

@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
Create and submit a task corresponding to @var{cl} with the following
arguments. The argument list must be zero-terminated.

The arguments following the codelet can be of the following types:

@itemize
@item
@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX}: an access mode followed by a data handle;
@item
the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
@code{STARPU_PRIORITY}, followed by the appropriate objects as
defined below.
@end itemize

Parameters to be passed to the codelet implementation are defined
through the type @code{STARPU_VALUE}. The function
@code{starpu_codelet_unpack_args} must be called within the codelet
implementation to retrieve them.
@end deftypefun

@defmac STARPU_VALUE
this macro is used when calling @code{starpu_insert_task}, and must be
followed by a pointer to a constant value and the size of the constant.
@end defmac

@defmac STARPU_CALLBACK
this macro is used when calling @code{starpu_insert_task}, and must be
followed by a pointer to a callback function.
@end defmac

@defmac STARPU_CALLBACK_ARG
this macro is used when calling @code{starpu_insert_task}, and must be
followed by a pointer to be given as an argument to the callback
function.
@end defmac

@defmac STARPU_CALLBACK_WITH_ARG
this macro is used when calling @code{starpu_insert_task}, and must be
followed by two pointers: one to a callback function, and the other to
be given as an argument to the callback function; this is equivalent
to using both @code{STARPU_CALLBACK} and
@code{STARPU_CALLBACK_ARG}.
@end defmac

@defmac STARPU_PRIORITY
this macro is used when calling @code{starpu_insert_task}, and must be
followed by an integer defining a priority level.
@end defmac

@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
given to a codelet and later unpacked with the function
@code{starpu_codelet_unpack_args} defined below.
@end deftypefun

@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
Retrieve the arguments of type @code{STARPU_VALUE} associated with a
task automatically created using the function
@code{starpu_insert_task} defined above.
@end deftypefun

Here is the implementation of the codelet:

@smallexample
void func_cpu(void *descr[], void *_args)
@{
    int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
    float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
    int ifactor;
    float ffactor;

    starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
    *x0 = *x0 * ifactor;
    *x1 = *x1 * ffactor;
@}

struct starpu_codelet mycodelet = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ func_cpu, NULL @},
    .nbuffers = 2,
    .modes = @{ STARPU_RW, STARPU_RW @}
@};
@end smallexample

And the call to the @code{starpu_insert_task} wrapper:

@smallexample
starpu_insert_task(&mycodelet,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                   0);
@end smallexample

The call to @code{starpu_insert_task} is equivalent to the following
code:

@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &mycodelet;
task->handles[0] = data_handles[0];
task->handles[1] = data_handles[1];
char *arg_buffer;
size_t arg_buffer_size;
starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
                         STARPU_VALUE, &ifactor, sizeof(ifactor),
                         STARPU_VALUE, &ffactor, sizeof(ffactor),
                         0);
task->cl_arg = arg_buffer;
task->cl_arg_size = arg_buffer_size;
int ret = starpu_task_submit(task);
@end smallexample

If some part of the task insertion depends on the value of some computation,
the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
instance, assuming that the index variable @code{i} was registered as handle
@code{i_handle}:

@smallexample
/* Compute which portion we will work on, e.g. pivot */
starpu_insert_task(&which_index, STARPU_W, i_handle, 0);

/* And submit the corresponding task */
STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R, starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
@end smallexample

The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
acquiring data @code{i} for the main application, and will execute the code
given as its third parameter when it is acquired. In other words, as soon as the
value of @code{i} computed by the @code{which_index} codelet can be read, the
portion of code passed as the third parameter of @code{STARPU_DATA_ACQUIRE_CB} will
be executed, and is allowed to read from @code{i} to use it e.g. as an
index. Note that this macro is only available when compiling StarPU with
the compiler @code{gcc}.

@node Parallel Tasks
@section Parallel Tasks

StarPU can leverage existing parallel computation libraries by the means of
parallel tasks. A parallel task is a task which is worked on by a set of CPUs
(called a parallel or combined worker) at the same time, by using an existing
parallel CPU implementation of the computation to be achieved. This can also be
useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
work collectively on a single task, the completion time of tasks on CPUs becomes
comparable to the completion time on GPUs, thus relieving from granularity
discrepancy concerns.

Two modes of execution exist to accommodate existing usage.

@subsection Fork-mode parallel tasks

In the Fork mode, StarPU will call the codelet function on one
of the CPUs of the combined worker. The codelet function can use
@code{starpu_combined_worker_get_size()} to get the number of threads it is
allowed to start to achieve the computation. The CPU binding mask is already
enforced, so that threads created by the function will inherit the mask, and
thus execute where StarPU expects. For instance, using OpenMP (full source is
available in @code{examples/openmp/vector_scal.c}):

@example
void scal_cpu_func(void *buffers[], void *_args)
@{
    unsigned i;
    float *factor = _args;
    struct starpu_vector_interface *vector = buffers[0];
    unsigned n = STARPU_VECTOR_GET_NX(vector);
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}

static struct starpu_codelet cl =
@{
    .modes = @{ STARPU_RW @},
    .where = STARPU_CPU,
    .type = STARPU_FORKJOIN,
    .max_parallelism = INT_MAX,
    .cpu_funcs = @{scal_cpu_func, NULL@},
    .nbuffers = 1,
@};
@end example

Other examples include for instance calling a BLAS parallel CPU implementation
(see @code{examples/mult/xgemm.c}).

@subsection SPMD-mode parallel tasks

In the SPMD mode, StarPU will call the codelet function on
each CPU of the combined worker. The codelet function can use
@code{starpu_combined_worker_get_size()} to get the total number of CPUs
involved in the combined worker, and thus the number of calls that are made in
parallel to the function, and @code{starpu_combined_worker_get_rank()} to get
the rank of the current CPU within the combined worker. For instance:

@example
static void func(void *buffers[], void *_args)
@{
    unsigned i;
    float *factor = _args;
    struct starpu_vector_interface *vector = buffers[0];
    unsigned n = STARPU_VECTOR_GET_NX(vector);
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

    /* Compute slice to compute */
    unsigned m = starpu_combined_worker_get_size();
    unsigned j = starpu_combined_worker_get_rank();
    unsigned slice = (n+m-1)/m;

    for (i = j * slice; i < (j+1) * slice && i < n; i++)
        val[i] *= *factor;
@}

static struct starpu_codelet cl =
@{
    .modes = @{ STARPU_RW @},
    .where = STARPU_CPU,
    .type = STARPU_SPMD,
    .max_parallelism = INT_MAX,
    .cpu_funcs = @{ func, NULL @},
    .nbuffers = 1,
@};
@end example

Of course, this trivial example will not really benefit from parallel task
execution, and was only meant to be simple to understand. The benefit comes
when the computation to be done is such that threads have to e.g. exchange
intermediate results, or write to the data in a complex but safe way in the same
buffer.

@subsection Parallel tasks performance

To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
be used. When exposed to codelets with a Fork or SPMD flag, the @code{pheft}
(parallel-heft) and @code{pgreedy} (parallel greedy) schedulers will indeed also
try to execute tasks with several CPUs. They will automatically try the various
available combined worker sizes and thus be able to avoid choosing a large
combined worker if the codelet does not actually scale so much.
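
Such a scheduler is selected as usual through the @code{STARPU_SCHED}
environment variable, for instance (@code{./my_application} standing for your
program):

@smallexample
$ STARPU_SCHED=pheft ./my_application
@end smallexample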

@subsection Combined worker sizes

By default, StarPU creates combined workers according to the architecture
structure as detected by hwloc. This means that for each object of the hwloc
topology (NUMA node, socket, cache, ...) a combined worker will be created. If
some nodes of the hierarchy have a large arity (e.g. many cores in a socket
without a hierarchy of shared caches), StarPU will create combined workers of
intermediate sizes.

@subsection Concurrent parallel tasks

Unfortunately, many environments and libraries do not support concurrent
calls.

For instance, most OpenMP implementations (including the main ones) do not
support concurrent @code{pragma omp parallel} statements without nesting them in
another @code{pragma omp parallel} statement, but StarPU does not yet support
creating its CPU workers by using such a pragma.

Other parallel libraries are also not safe when being invoked concurrently
from different threads, due to the use of global variables in their sequential
sections for instance.

The solution is then to use only one combined worker at a time. This can be
done by setting @code{single_combined_worker} to 1 in the @code{starpu_conf}
structure, or by setting the @code{STARPU_SINGLE_COMBINED_WORKER} environment
variable to 1. StarPU will then run only one parallel task at a time.
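
A minimal sketch of the first option:

@cartouche
@smallexample
/* Let StarPU run at most one parallel task at a time. */
struct starpu_conf conf;
starpu_conf_init(&conf);
conf.single_combined_worker = 1;
starpu_init(&conf);
@end smallexample
@end cartouche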

@node Debugging
@section Debugging

StarPU provides several tools to help debug applications. Execution traces
can be generated and displayed graphically, see @ref{Generating traces}. Some
gdb helpers are also provided to show the whole StarPU state:

@smallexample
(gdb) source tools/gdbinit
(gdb) help starpu
@end smallexample

@node The multiformat interface
@section The multiformat interface

It may be interesting to represent the same piece of data using two different
data structures: one that would only be used on CPUs, and one that would only
be used on GPUs. This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
Note that the @code{heft} scheduler is the only one optimized for this interface. The
user must provide StarPU with conversion codelets:

@cartouche
@smallexample
#define NX 1024
struct point array_of_structs[NX];
starpu_data_handle_t handle;

/*
 * The conversion of a piece of data is itself a task, though it is created,
 * submitted and destroyed by StarPU internals and not by the user. Therefore,
 * we have to define two codelets.
 * Note that for now the conversion from the CPU format to the GPU format has to
 * be executed on the GPU, and the conversion from the GPU to the CPU has to be
 * executed on the CPU.
 */
#ifdef STARPU_USE_OPENCL
void cpu_to_opencl_opencl_func(void *buffers[], void *args);
struct starpu_codelet cpu_to_opencl_cl = @{
    .where = STARPU_OPENCL,
    .opencl_funcs = @{ cpu_to_opencl_opencl_func, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};

void opencl_to_cpu_func(void *buffers[], void *args);
struct starpu_codelet opencl_to_cpu_cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};
#endif

struct starpu_multiformat_data_interface_ops format_ops = @{
#ifdef STARPU_USE_OPENCL
    .opencl_elemsize = 2 * sizeof(float),
    .cpu_to_opencl_cl = &cpu_to_opencl_cl,
    .opencl_to_cpu_cl = &opencl_to_cpu_cl,
#endif
    .cpu_elemsize = 2 * sizeof(float),
    ...
@};
starpu_multiformat_data_register(&handle, 0, &array_of_structs, NX, &format_ops);
@end smallexample
@end cartouche

Kernels can be written almost as for any other interface. Note that
@code{STARPU_MULTIFORMAT_GET_CPU_PTR} shall only be used for CPU kernels. CUDA kernels
must use @code{STARPU_MULTIFORMAT_GET_CUDA_PTR}, and OpenCL kernels must use
@code{STARPU_MULTIFORMAT_GET_OPENCL_PTR}. @code{STARPU_MULTIFORMAT_GET_NX} may
be used in any kind of kernel.

@cartouche
@smallexample
static void
multiformat_scal_cpu_func(void *buffers[], void *args)
@{
    struct point *aos;
    unsigned int n;

    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
@}

extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
@{
    unsigned int n;
    struct struct_of_arrays *soa;

    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

    ...
@}
@end smallexample
@end cartouche

A full example may be found in @code{examples/basic_examples/multiformat.c}.

@node On-GPU rendering
@section On-GPU rendering

Graphics-oriented applications need to draw the result of their computations,
typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
interoperability make it possible to let CUDA work directly on the OpenGL buffers,
thus making them immediately ready for drawing, by mapping OpenGL buffer, texture or
renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
be given the CUDA pointer at registration, for instance:

@cartouche
@smallexample
/* Find a CUDA worker */
for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
    if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
        break;

cudaSetDevice(starpu_worker_get_devid(workerid));
cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
                            (uintptr_t) output, num_bytes / sizeof(float4),
                            sizeof(float4));

starpu_insert_task(&cl, STARPU_RW, handle, 0);

starpu_data_unregister(handle);

cudaSetDevice(starpu_worker_get_devid(workerid));
cudaGraphicsUnmapResources(1, &resource, 0);

/* Now display it */
@end smallexample
@end cartouche

@node More examples
@section More examples

More examples are available in the StarPU sources in the @code{examples/}
directory. Simple examples include:

@table @asis
@item @code{incrementer/}:
    Trivial incrementation test.
@item @code{basic_examples/}:
    Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
    in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
    product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
    interface, an example using the variable data interface, and an example
    using different formats on CPUs and GPUs.
@item @code{matvecmult/}:
    OpenCL example from NVidia, adapted to StarPU.
@item @code{axpy/}:
    AXPY CUBLAS operation adapted to StarPU.
@item @code{fortran/}:
    Example of Fortran bindings.
@end table

More advanced examples include:

@table @asis
@item @code{filters/}:
    Examples using filters, as shown in @ref{Partitioning Data}.
@item @code{lu/}:
    LU matrix factorization, see for instance @code{xlu_implicit.c}.
@item @code{cholesky/}:
    Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
@end table