Browse Source

Create the trunk/, branches/ and tags/ directories

Cédric Augonnet 16 years ago
commit
fc22dad676
100 changed files with 28479 additions and 0 deletions
  1. 2 0
      AUTHORS
  2. 510 0
      COPYING.LGPL
  3. 33 0
      Makefile.am
  4. 142 0
      build-aux/compile
  5. 1526 0
      build-aux/config.guess
  6. 1658 0
      build-aux/config.sub
  7. 589 0
      build-aux/depcomp
  8. 269 0
      build-aux/install-sh
  9. 6964 0
      build-aux/ltmain.sh
  10. 367 0
      build-aux/missing
  11. 444 0
      configure.ac
  12. 214 0
      examples/Makefile.am
  13. 46 0
      examples/cholesky/Makefile.in
  14. 357 0
      examples/cholesky/dw_cholesky.c
  15. 101 0
      examples/cholesky/dw_cholesky.h
  16. 195 0
      examples/cholesky/dw_cholesky_kernels.c
  17. 152 0
      examples/cholesky/dw_cholesky_models.c
  18. 22 0
      examples/cholesky/dw_cholesky_models.h
  19. 306 0
      examples/cholesky/dw_cholesky_no_stride.c
  20. 30 0
      examples/common/Makefile.in
  21. 241 0
      examples/common/blas.c
  22. 97 0
      examples/common/blas.h
  23. 46 0
      examples/common/blas_model.c
  24. 41 0
      examples/common/blas_model.h
  25. 27 0
      examples/cuda/incrementer_cuda.cu
  26. 23 0
      examples/cuda/incrementer_cuda.h
  27. 110 0
      examples/cuda/spmv_cuda.cu
  28. 28 0
      examples/fortran/bindings/Makefile
  29. 54 0
      examples/fortran/bindings/StarPU-fortran.h
  30. 33 0
      examples/fortran/bindings/hello-c.c
  31. 35 0
      examples/fortran/bindings/hello.F
  32. 41 0
      examples/heat/Makefile.in
  33. 749 0
      examples/heat/dw_factolu.c
  34. 212 0
      examples/heat/dw_factolu.h
  35. 293 0
      examples/heat/dw_factolu_kernels.c
  36. 303 0
      examples/heat/dw_factolu_tag.c
  37. 466 0
      examples/heat/dw_sparse_cg.c
  38. 136 0
      examples/heat/dw_sparse_cg.h
  39. 423 0
      examples/heat/dw_sparse_cg_kernels.c
  40. 766 0
      examples/heat/heat.c
  41. 70 0
      examples/heat/heat.h
  42. 283 0
      examples/heat/heat_display.c
  43. 277 0
      examples/heat/lu_kernels_model.c
  44. 22 0
      examples/heat/lu_kernels_model.h
  45. 39 0
      examples/incrementer/Makefile.in
  46. 178 0
      examples/incrementer/incrementer.c
  47. 46 0
      examples/mult/Makefile.in
  48. 332 0
      examples/mult/dw_mult.c
  49. 169 0
      examples/mult/dw_mult.h
  50. 461 0
      examples/mult/dw_mult_no_stride.c
  51. 467 0
      examples/mult/dw_mult_no_stride_no_tag.c
  52. 28 0
      examples/pastix-wrappers/Makefile
  53. 34 0
      examples/pastix-wrappers/generated_model.h
  54. 59 0
      examples/pastix-wrappers/models/Makefile
  55. 31 0
      examples/pastix-wrappers/models/model.sh
  56. 26 0
      examples/pastix-wrappers/models/num_recipes/complex.h
  57. 20 0
      examples/pastix-wrappers/models/num_recipes/covsrt.c
  58. 60 0
      examples/pastix-wrappers/models/num_recipes/gaussj.c
  59. 66 0
      examples/pastix-wrappers/models/num_recipes/lfit.c
  60. 530 0
      examples/pastix-wrappers/models/num_recipes/nr.h
  61. 295 0
      examples/pastix-wrappers/models/num_recipes/nrutil.c
  62. 79 0
      examples/pastix-wrappers/models/num_recipes/nrutil.h
  63. 141 0
      examples/pastix-wrappers/models/reg_gemm.c
  64. 122 0
      examples/pastix-wrappers/models/reg_trsm.c
  65. 755 0
      examples/pastix-wrappers/starpu-blas-wrapper.c
  66. 108 0
      examples/pastix-wrappers/starpu-blas-wrapper.h
  67. 46 0
      examples/spmv/Makefile.in
  68. 288 0
      examples/spmv/dw_block_spmv.c
  69. 41 0
      examples/spmv/dw_block_spmv.h
  70. 64 0
      examples/spmv/dw_block_spmv_kernels.c
  71. 349 0
      examples/spmv/dw_spmv.c
  72. 31 0
      examples/spmv/dw_spmv.h
  73. 20 0
      examples/spmv/matrix-market/example_read.c
  74. 522 0
      examples/spmv/matrix-market/examples/fidapm05.mtx
  75. 346 0
      examples/spmv/matrix-market/mm_to_bcsr.c
  76. 36 0
      examples/spmv/matrix-market/mm_to_bcsr.h
  77. 512 0
      examples/spmv/matrix-market/mmio.c
  78. 133 0
      examples/spmv/matrix-market/mmio.h
  79. 43 0
      examples/strassen/Makefile.in
  80. 515 0
      examples/strassen/strassen.c
  81. 114 0
      examples/strassen/strassen.h
  82. 198 0
      examples/strassen/strassen_kernels.c
  83. 156 0
      examples/strassen/strassen_models.c
  84. 22 0
      examples/strassen/strassen_models.h
  85. 188 0
      examples/strassen/test_strassen.c
  86. 37 0
      examples/strassen2/Makefile.in
  87. 833 0
      examples/strassen2/strassen2.c
  88. 242 0
      examples/strassen2/strassen2_kernels.c
  89. 38 0
      examples/tag_example/Makefile.in
  90. 207 0
      examples/tag_example/tag_example.c
  91. 149 0
      examples/tag_example/tag_example2.c
  92. 51 0
      include/starpu-data-filters.h
  93. 133 0
      include/starpu-data-interfaces.h
  94. 47 0
      include/starpu-data.h
  95. 32 0
      include/starpu-mutex.h
  96. 94 0
      include/starpu-perfmodel.h
  97. 144 0
      include/starpu-task.h
  98. 63 0
      include/starpu-util.h
  99. 36 0
      include/starpu.h
  100. 0 0
      include/starpu_config.h.in

+ 2 - 0
AUTHORS

@@ -0,0 +1,2 @@
+Cédric Augonnet <cedric.augonnet@inria.fr>
+Samuel Thibault <samuel.thibault@labri.fr>

+ 510 - 0
COPYING.LGPL

@@ -0,0 +1,510 @@
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+	51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations
+below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it
+becomes a de-facto standard.  To achieve this, non-free programs must
+be allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+                  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control
+compilation and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at least
+    three years, to give the same user the materials specified in
+    Subsection 6a, above, for a charge no more than the cost of
+    performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply, and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License
+may add an explicit geographical distribution limitation excluding those
+countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+                            NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms
+of the ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.
+It is safest to attach them to the start of each source file to most
+effectively convey the exclusion of warranty; and each file should
+have at least the "copyright" line and a pointer to where the full
+notice is found.
+
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or
+your school, if any, to sign a "copyright disclaimer" for the library,
+if necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James
+  Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
+
+

+ 33 - 0
Makefile.am

@@ -0,0 +1,33 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# `examples' is declared phony; it is also the name of a subdirectory
+# listed in SUBDIRS below (presumably the reason -- confirm).
+.PHONY: examples
+
+# Subdirectories handed to Automake's recursive targets.
+SUBDIRS = src tools examples tests 
+
+# pkg-config metadata: libstarpu.pc is installed under $(libdir)/pkgconfig.
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpu.pc
+
+# Headers installed through Automake's include_HEADERS primary.
+include_HEADERS = 				\
+	include/starpu.h			\
+	include/starpu_config.h			\
+	include/starpu-data-filters.h		\
+	include/starpu-data-interfaces.h	\
+	include/starpu-mutex.h			\
+	include/starpu-task.h			\
+	include/starpu-data.h			\
+	include/starpu-perfmodel.h		\
+	include/starpu-util.h

+ 142 - 0
build-aux/compile

@@ -0,0 +1,142 @@
+#! /bin/sh
+# Wrapper for compilers which do not understand `-c -o'.
+
+scriptversion=2005-05-14.22
+
+# Copyright (C) 1999, 2000, 2003, 2004, 2005 Free Software Foundation, Inc.
+# Written by Tom Tromey <tromey@cygnus.com>.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+case $1 in
+  '')
+     echo "$0: No command.  Try \`$0 --help' for more information." 1>&2
+     exit 1;
+     ;;
+  -h | --h*)
+    cat <<\EOF
+Usage: compile [--help] [--version] PROGRAM [ARGS]
+
+Wrapper for compilers which do not understand `-c -o'.
+Remove `-o dest.o' from ARGS, run PROGRAM with the remaining
+arguments, and rename the output as expected.
+
+If you are trying to build a whole package this is not the
+right script to run: please start by reading the file `INSTALL'.
+
+Report bugs to <bug-automake@gnu.org>.
+EOF
+    exit $?
+    ;;
+  -v | --v*)
+    echo "compile $scriptversion"
+    exit $?
+    ;;
+esac
+
+ofile=
+cfile=
+eat=
+
+for arg
+do
+  if test -n "$eat"; then
+    eat=
+  else
+    case $1 in
+      -o)
+	# configure might choose to run compile as `compile cc -o foo foo.c'.
+	# So we strip `-o arg' only if arg is an object.
+	eat=1
+	case $2 in
+	  *.o | *.obj)
+	    ofile=$2
+	    ;;
+	  *)
+	    set x "$@" -o "$2"
+	    shift
+	    ;;
+	esac
+	;;
+      *.c)
+	cfile=$1
+	set x "$@" "$1"
+	shift
+	;;
+      *)
+	set x "$@" "$1"
+	shift
+	;;
+    esac
+  fi
+  shift
+done
+
+if test -z "$ofile" || test -z "$cfile"; then
+  # If no `-o' option was seen then we might have been invoked from a
+  # pattern rule where we don't need one.  That is ok -- this is a
+  # normal compilation that the losing compiler can handle.  If no
+  # `.c' file was seen then we are probably linking.  That is also
+  # ok.
+  exec "$@"
+fi
+
+# Name of file we expect compiler to create.
+cofile=`echo "$cfile" | sed -e 's|^.*/||' -e 's/\.c$/.o/'`
+
+# Create the lock directory.
+# Note: use `[/.-]' here to ensure that we don't use the same name
+# that we are using for the .o file.  Also, base the name on the expected
+# object file name, since that is what matters with a parallel build.
+lockdir=`echo "$cofile" | sed -e 's|[/.-]|_|g'`.d
+while true; do
+  if mkdir "$lockdir" >/dev/null 2>&1; then
+    break
+  fi
+  sleep 1
+done
+# FIXME: race condition here if user kills between mkdir and trap.
+trap "rmdir '$lockdir'; exit 1" 1 2 15
+
+# Run the compile.
+"$@"
+ret=$?
+
+if test -f "$cofile"; then
+  mv "$cofile" "$ofile"
+elif test -f "${cofile}bj"; then
+  mv "${cofile}bj" "$ofile"
+fi
+
+rmdir "$lockdir"
+exit $ret
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:

File diff suppressed because it is too large
+ 1526 - 0
build-aux/config.guess


File diff suppressed because it is too large
+ 1658 - 0
build-aux/config.sub


+ 589 - 0
build-aux/depcomp

@@ -0,0 +1,589 @@
+#! /bin/sh
+# depcomp - compile a program generating dependencies as side-effects
+
+scriptversion=2007-03-29.01
+
+# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007 Free Software
+# Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+# 02110-1301, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
+
+case $1 in
+  '')
+     echo "$0: No command.  Try \`$0 --help' for more information." 1>&2
+     exit 1;
+     ;;
+  -h | --h*)
+    cat <<\EOF
+Usage: depcomp [--help] [--version] PROGRAM [ARGS]
+
+Run PROGRAMS ARGS to compile a file, generating dependencies
+as side-effects.
+
+Environment variables:
+  depmode     Dependency tracking mode.
+  source      Source file read by `PROGRAMS ARGS'.
+  object      Object file output by `PROGRAMS ARGS'.
+  DEPDIR      directory where to store dependencies.
+  depfile     Dependency file to output.
+  tmpdepfile  Temporary file to use when outputing dependencies.
+  libtool     Whether libtool is used (yes/no).
+
+Report bugs to <bug-automake@gnu.org>.
+EOF
+    exit $?
+    ;;
+  -v | --v*)
+    echo "depcomp $scriptversion"
+    exit $?
+    ;;
+esac
+
+if test -z "$depmode" || test -z "$source" || test -z "$object"; then
+  echo "depcomp: Variables source, object and depmode must be set" 1>&2
+  exit 1
+fi
+
+# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
+depfile=${depfile-`echo "$object" |
+  sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
+tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
+
+rm -f "$tmpdepfile"
+
+# Some modes work just like other modes, but use different flags.  We
+# parameterize here, but still list the modes in the big case below,
+# to make depend.m4 easier to write.  Note that we *cannot* use a case
+# here, because this file can only contain one case statement.
+if test "$depmode" = hp; then
+  # HP compiler uses -M and no extra arg.
+  gccflag=-M
+  depmode=gcc
+fi
+
+if test "$depmode" = dashXmstdout; then
+   # This is just like dashmstdout with a different argument.
+   dashmflag=-xM
+   depmode=dashmstdout
+fi
+
+case "$depmode" in
+gcc3)
+## gcc 3 implements dependency tracking that does exactly what
+## we want.  Yay!  Note: for some reason libtool 1.4 doesn't like
+## it if -MD -MP comes after the -MF stuff.  Hmm.
+## Unfortunately, FreeBSD c89 acceptance of flags depends upon
+## the command line argument order; so add the flags where they
+## appear in depend2.am.  Note that the slowdown incurred here
+## affects only configure: in makefiles, %FASTDEP% shortcuts this.
+  for arg
+  do
+    case $arg in
+    -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
+    *)  set fnord "$@" "$arg" ;;
+    esac
+    shift # fnord
+    shift # $arg
+  done
+  "$@"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  mv "$tmpdepfile" "$depfile"
+  ;;
+
+gcc)
+## There are various ways to get dependency output from gcc.  Here's
+## why we pick this rather obscure method:
+## - Don't want to use -MD because we'd like the dependencies to end
+##   up in a subdir.  Having to rename by hand is ugly.
+##   (We might end up doing this anyway to support other compilers.)
+## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
+##   -MM, not -M (despite what the docs say).
+## - Using -M directly means running the compiler twice (even worse
+##   than renaming).
+  if test -z "$gccflag"; then
+    gccflag=-MD,
+  fi
+  "$@" -Wp,"$gccflag$tmpdepfile"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
+## The second -e expression handles DOS-style file names with drive letters.
+  sed -e 's/^[^:]*: / /' \
+      -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
+## This next piece of magic avoids the `deleted header file' problem.
+## The problem is that when a header file which appears in a .P file
+## is deleted, the dependency causes make to die (because there is
+## typically no way to rebuild the header).  We avoid this by adding
+## dummy dependencies for each header file.  Too bad gcc doesn't do
+## this for us directly.
+  tr ' ' '
+' < "$tmpdepfile" |
+## Some versions of gcc put a space before the `:'.  On the theory
+## that the space means something, we add a space to the output as
+## well.
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+hp)
+  # This case exists only to let depend.m4 do its work.  It works by
+  # looking at the text of this script.  This case will never be run,
+  # since it is checked for above.
+  exit 1
+  ;;
+
+sgi)
+  # Dependency mode "sgi": run the compiler with -MDupdate so that it
+  # writes its dependency list to $tmpdepfile, then rewrite that list
+  # into $depfile in the two-part format used by the other modes
+  # (one rule for $object, then one dummy target per dependency).
+  if test "$libtool" = yes; then
+    "$@" "-Wp,-MDupdate,$tmpdepfile"
+  else
+    "$@" -MDupdate "$tmpdepfile"
+  fi
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    # Compilation failed: discard partial output and propagate the status.
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+
+  if test -f "$tmpdepfile"; then  # yes, the sourcefile depends on other files
+    echo "$object : \\" > "$depfile"
+
+    # Clip off the initial element (the dependent).  Don't try to be
+    # clever and replace this with sed code, as IRIX sed won't handle
+    # lines with more than a fixed number of characters (4096 in
+    # IRIX 6.2 sed, 8192 in IRIX 6.5).  We also remove comment lines;
+    # the IRIX cc adds comments like `#:fec' to the end of the
+    # dependency line.
+    # "$depfile" is quoted in every redirection below so that a path
+    # containing whitespace or glob characters is used literally.
+    tr ' ' '
+' < "$tmpdepfile" \
+    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
+    tr '
+' ' ' >> "$depfile"
+    echo >> "$depfile"
+
+    # The second pass generates a dummy entry for each header file.
+    tr ' ' '
+' < "$tmpdepfile" \
+   | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
+   >> "$depfile"
+  else
+    # The sourcefile does not contain any dependencies, so just
+    # store a dummy comment line, to avoid errors with the Makefile
+    # "include basename.Plo" scheme.
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile"
+  ;;
+
+aix)
+  # The C for AIX Compiler uses -M and outputs the dependencies
+  # in a .u file.  In older versions, this file always lives in the
+  # current directory.  Also, the AIX compiler puts `$object:' at the
+  # start of each line; $object doesn't have directory information.
+  # Version 6 uses the directory in both cases.
+  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
+  test "x$dir" = "x$object" && dir=
+  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+  if test "$libtool" = yes; then
+    tmpdepfile1=$dir$base.u
+    tmpdepfile2=$base.u
+    tmpdepfile3=$dir.libs/$base.u
+    "$@" -Wc,-M
+  else
+    tmpdepfile1=$dir$base.u
+    tmpdepfile2=$dir$base.u
+    tmpdepfile3=$dir$base.u
+    "$@" -M
+  fi
+  stat=$?
+
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
+    exit $stat
+  fi
+
+  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
+  do
+    test -f "$tmpdepfile" && break
+  done
+  if test -f "$tmpdepfile"; then
+    # Each line is of the form `foo.o: dependent.h'.
+    # Do two passes, one to just change these to
+    # `$object: dependent.h' and one to simply `dependent.h:'.
+    sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
+    # That's a tab and a space in the [].
+    sed -e 's,^.*\.[a-z]*:[	 ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
+  else
+    # The sourcefile does not contain any dependencies, so just
+    # store a dummy comment line, to avoid errors with the Makefile
+    # "include basename.Plo" scheme.
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile"
+  ;;
+
+icc)
+  # Intel's C compiler understands `-MD -MF file'.  However on
+  #    icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c
+  # ICC 7.0 will fill foo.d with something like
+  #    foo.o: sub/foo.c
+  #    foo.o: sub/foo.h
+  # which is wrong.  We want:
+  #    sub/foo.o: sub/foo.c
+  #    sub/foo.o: sub/foo.h
+  #    sub/foo.c:
+  #    sub/foo.h:
+  # ICC 7.1 will output
+  #    foo.o: sub/foo.c sub/foo.h
+  # and will wrap long lines using \ :
+  #    foo.o: sub/foo.c ... \
+  #     sub/foo.h ... \
+  #     ...
+
+  "$@" -MD -MF "$tmpdepfile"
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+    rm -f "$tmpdepfile"
+    exit $stat
+  fi
+  rm -f "$depfile"
+  # Each line is of the form `foo.o: dependent.h',
+  # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
+  # Do two passes, one to just change these to
+  # `$object: dependent.h' and one to simply `dependent.h:'.
+  sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
+  # Some versions of the HPUX 10.20 sed can't process this invocation
+  # correctly.  Breaking it into two sed invocations is a workaround.
+  sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" |
+    sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+hp2)
+  # The "hp" stanza above does not work with aCC (C++) and HP's ia64
+  # compilers, which have integrated preprocessors.  The correct option
+  # to use with these is +Maked; it writes dependencies to a file named
+  # 'foo.d', which lands next to the object file, wherever that
+  # happens to be.
+  # Much of this is similar to the tru64 case; see comments there.
+  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
+  test "x$dir" = "x$object" && dir=
+  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+  if test "$libtool" = yes; then
+    tmpdepfile1=$dir$base.d
+    tmpdepfile2=$dir.libs/$base.d
+    "$@" -Wc,+Maked
+  else
+    tmpdepfile1=$dir$base.d
+    tmpdepfile2=$dir$base.d
+    "$@" +Maked
+  fi
+  stat=$?
+  if test $stat -eq 0; then :
+  else
+     rm -f "$tmpdepfile1" "$tmpdepfile2"
+     exit $stat
+  fi
+
+  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
+  do
+    test -f "$tmpdepfile" && break
+  done
+  if test -f "$tmpdepfile"; then
+    sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
+    # Add `dependent.h:' lines.
+    sed -ne '2,${; s/^ *//; s/ \\*$//; s/$/:/; p;}' "$tmpdepfile" >> "$depfile"
+  else
+    echo "#dummy" > "$depfile"
+  fi
+  rm -f "$tmpdepfile" "$tmpdepfile2"
+  ;;
+
+tru64)
+   # The Tru64 compiler uses -MD to generate dependencies as a side
+   # effect.  `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'.
+   # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
+   # dependencies in `foo.d' instead, so we check for that too.
+   # Subdirectories are respected.
+   dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
+   test "x$dir" = "x$object" && dir=
+   base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
+
+   if test "$libtool" = yes; then
+      # With Tru64 cc, shared objects can also be used to make a
+      # static library.  This mechanism is used in libtool 1.4 series to
+      # handle both shared and static libraries in a single compilation.
+      # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
+      #
+      # With libtool 1.5 this exception was removed, and libtool now
+      # generates 2 separate objects for the 2 libraries.  These two
+      # compilations output dependencies in $dir.libs/$base.o.d and
+      # in $dir$base.o.d.  We have to check for both files, because
+      # one of the two compilations can be disabled.  We should prefer
+      # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
+      # automatically cleaned when .libs/ is deleted, while ignoring
+      # the former would cause a distcleancheck panic.
+      tmpdepfile1=$dir.libs/$base.lo.d   # libtool 1.4
+      tmpdepfile2=$dir$base.o.d          # libtool 1.5
+      tmpdepfile3=$dir.libs/$base.o.d    # libtool 1.5
+      tmpdepfile4=$dir.libs/$base.d      # Compaq CCC V6.2-504
+      "$@" -Wc,-MD
+   else
+      tmpdepfile1=$dir$base.o.d
+      tmpdepfile2=$dir$base.d
+      tmpdepfile3=$dir$base.d
+      tmpdepfile4=$dir$base.d
+      "$@" -MD
+   fi
+
+   stat=$?
+   if test $stat -eq 0; then :
+   else
+      rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
+      exit $stat
+   fi
+
+   for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
+   do
+     test -f "$tmpdepfile" && break
+   done
+   if test -f "$tmpdepfile"; then
+      sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
+      # That's a tab and a space in the [].
+      sed -e 's,^.*\.[a-z]*:[	 ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
+   else
+      echo "#dummy" > "$depfile"
+   fi
+   rm -f "$tmpdepfile"
+   ;;
+
+#nosideeffect)
+  # This comment above is used by automake to tell side-effect
+  # dependency tracking mechanisms from slower ones.
+
+dashmstdout)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout, regardless of -o.
+  "$@" || exit $?
+
+  # Remove the call to Libtool.
+  if test "$libtool" = yes; then
+    while test $1 != '--mode=compile'; do
+      shift
+    done
+    shift
+  fi
+
+  # Remove `-o $object'.
+  IFS=" "
+  for arg
+  do
+    case $arg in
+    -o)
+      shift
+      ;;
+    $object)
+      shift
+      ;;
+    *)
+      set fnord "$@" "$arg"
+      shift # fnord
+      shift # $arg
+      ;;
+    esac
+  done
+
+  test -z "$dashmflag" && dashmflag=-M
+  # Require at least two characters before searching for `:'
+  # in the target name.  This is to cope with DOS-style filenames:
+  # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise.
+  "$@" $dashmflag |
+    sed 's:^[  ]*[^: ][^:][^:]*\:[    ]*:'"$object"'\: :' > "$tmpdepfile"
+  rm -f "$depfile"
+  cat < "$tmpdepfile" > "$depfile"
+  tr ' ' '
+' < "$tmpdepfile" | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+dashXmstdout)
+  # This case only exists to satisfy depend.m4.  It is never actually
+  # run, as this mode is specially recognized in the preamble.
+  exit 1
+  ;;
+
+makedepend)
+  # Dependency mode "makedepend": compile normally first, then run
+  # ${MAKEDEPEND-makedepend} on a filtered copy of the command line and
+  # convert its output into $depfile.
+  "$@" || exit $?
+  # Remove any Libtool call
+  if test "$libtool" = yes; then
+    while test $1 != '--mode=compile'; do
+      shift
+    done
+    shift
+  fi
+  # X makedepend
+  shift
+  # Rebuild the positional parameters, keeping only what makedepend can
+  # digest.  The list is emptied on the first iteration ($cleared), then
+  # the surviving arguments are appended one by one.
+  cleared=no
+  for arg in "$@"; do
+    case $cleared in
+    no)
+      set ""; shift
+      cleared=yes ;;
+    esac
+    case "$arg" in
+    -D*|-I*)
+      set fnord "$@" "$arg"; shift ;;
+    # Strip any option that makedepend may not understand.  Remove
+    # the object too, otherwise makedepend will parse it as a source file.
+    -*|$object)
+      ;;
+    *)
+      set fnord "$@" "$arg"; shift ;;
+    esac
+  done
+  # Keep everything from the last dot of $object onwards (e.g. `.o').
+  # $object is quoted so that the name is not subject to field splitting
+  # or pathname expansion before sed sees it.
+  obj_suffix=`echo "$object" | sed 's/^.*\././'`
+  touch "$tmpdepfile"
+  ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
+  rm -f "$depfile"
+  cat < "$tmpdepfile" > "$depfile"
+  # Skip the first two lines of makedepend's output, then emit one dummy
+  # `file :' target per dependency.
+  sed '1,2d' "$tmpdepfile" | tr ' ' '
+' | \
+## Some versions of the HPUX 10.20 sed can't process this invocation
+## correctly.  Breaking it into two sed invocations is a workaround.
+    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile" "$tmpdepfile".bak
+  ;;
+
+cpp)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout.
+  "$@" || exit $?
+
+  # Remove the call to Libtool.
+  if test "$libtool" = yes; then
+    while test $1 != '--mode=compile'; do
+      shift
+    done
+    shift
+  fi
+
+  # Remove `-o $object'.
+  IFS=" "
+  for arg
+  do
+    case $arg in
+    -o)
+      shift
+      ;;
+    $object)
+      shift
+      ;;
+    *)
+      set fnord "$@" "$arg"
+      shift # fnord
+      shift # $arg
+      ;;
+    esac
+  done
+
+  "$@" -E |
+    sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
+       -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
+    sed '$ s: \\$::' > "$tmpdepfile"
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  cat < "$tmpdepfile" >> "$depfile"
+  sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+msvisualcpp)
+  # Important note: in order to support this mode, a compiler *must*
+  # always write the preprocessed file to stdout, regardless of -o,
+  # because we must use -o when running libtool.
+  "$@" || exit $?
+  IFS=" "
+  for arg
+  do
+    case "$arg" in
+    "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
+	set fnord "$@"
+	shift
+	shift
+	;;
+    *)
+	set fnord "$@" "$arg"
+	shift
+	shift
+	;;
+    esac
+  done
+  "$@" -E |
+  sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::echo "`cygpath -u \\"\1\\"`":p' | sort | uniq > "$tmpdepfile"
+  rm -f "$depfile"
+  echo "$object : \\" > "$depfile"
+  . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::	\1 \\:p' >> "$depfile"
+  echo "	" >> "$depfile"
+  . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::\1\::p' >> "$depfile"
+  rm -f "$tmpdepfile"
+  ;;
+
+none)
+  exec "$@"
+  ;;
+
+*)
+  echo "Unknown depmode $depmode" 1>&2
+  exit 1
+  ;;
+esac
+
+exit 0
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:

+ 269 - 0
build-aux/install-sh

@@ -0,0 +1,269 @@
+#!/bin/sh
+#
+# install - install a program, script, or datafile
+#
+# This originates from X11R5 (mit/util/scripts/install.sh), which was
+# later released in X11R6 (xc/config/util/install.sh) with the
+# following copyright and license.
+#
+# Copyright (C) 1994 X Consortium
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
+# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# Except as contained in this notice, the name of the X Consortium shall not
+# be used in advertising or otherwise to promote the sale, use or other deal-
+# ings in this Software without prior written authorization from the X Consor-
+# tium.
+#
+#
+# FSF changes to this file are in the public domain.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.  It can only install one file at a time, a restriction
+# shared with many OS's install programs.
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+# Default option state; the option-parsing loop below overrides these.
+transformbasename=""
+# This must be spelled `transformarg': that is the name assigned by the
+# -t= option and tested before the destination file name is computed.
+# Resetting it here keeps a same-named environment variable from
+# silently changing the installed file name.
+transformarg=""
+instcmd="$mvprog"
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+while [ x"$1" != x ]; do
+    case $1 in
+	-c) instcmd="$cpprog"
+	    shift
+	    continue;;
+
+	-d) dir_arg=true
+	    shift
+	    continue;;
+
+	-m) chmodcmd="$chmodprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-o) chowncmd="$chownprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-g) chgrpcmd="$chgrpprog $2"
+	    shift
+	    shift
+	    continue;;
+
+	-s) stripcmd="$stripprog"
+	    shift
+	    continue;;
+
+	-t=*) transformarg=`echo $1 | sed 's/-t=//'`
+	    shift
+	    continue;;
+
+	-b=*) transformbasename=`echo $1 | sed 's/-b=//'`
+	    shift
+	    continue;;
+
+	*)  if [ x"$src" = x ]
+	    then
+		src=$1
+	    else
+		# this colon is to work around a 386BSD /bin/sh bug
+		:
+		dst=$1
+	    fi
+	    shift
+	    continue;;
+    esac
+done
+
+# Validate the parsed arguments.  Every fatal diagnostic is written to
+# stderr so that it cannot be mistaken for, or captured as, normal output.
+if [ x"$src" = x ]
+then
+	echo "install:	no input file specified" >&2
+	exit 1
+else
+	true
+fi
+
+if [ x"$dir_arg" != x ]; then
+	# -d mode: the single operand is the directory to create.
+	dst=$src
+	src=""
+	
+	if [ -d $dst ]; then
+		# Already present: nothing to create, and leave its mode alone.
+		instcmd=:
+		chmodcmd=""
+	else
+		instcmd=mkdir
+	fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad 
+# if $src (and thus $dsttmp) contains '*'.
+
+	if [ -f $src -o -d $src ]
+	then
+		true
+	else
+		echo "install:  $src does not exist" >&2
+		exit 1
+	fi
+	
+	if [ x"$dst" = x ]
+	then
+		echo "install:	no destination specified" >&2
+		exit 1
+	else
+		true
+	fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+	if [ -d $dst ]
+	then
+		dst="$dst"/`basename $src`
+	else
+		true
+	fi
+fi
+
+## this sed command emulates the dirname command
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
+
+# Make sure that the destination directory exists.
+#  this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+defaultIFS=' 	
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/ for some reason.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+while [ $# -ne 0 ] ; do
+	pathcomp="${pathcomp}${1}"
+	shift
+
+	if [ ! -d "${pathcomp}" ] ;
+        then
+		$mkdirprog "${pathcomp}"
+	else
+		true
+	fi
+
+	pathcomp="${pathcomp}/"
+done
+fi
+
+if [ x"$dir_arg" != x ]
+then
+	$doit $instcmd $dst &&
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+
+	if [ x"$transformarg" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		dstfile=`basename $dst $transformbasename | 
+			sed $transformarg`$transformbasename
+	fi
+
+# don't allow the sed command to completely eliminate the filename
+
+	if [ x"$dstfile" = x ] 
+	then
+		dstfile=`basename $dst`
+	else
+		true
+	fi
+
+# Make a temp file name in the proper directory.
+
+	dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the file name to the temp name
+
+	$doit $instcmd $src $dsttmp &&
+
+	trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing.  If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+	if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+	if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+	if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+	if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now rename the file to the real destination.
+
+	$doit $rmcmd -f $dstdir/$dstfile &&
+	$doit $mvcmd $dsttmp $dstdir/$dstfile 
+
+fi &&
+
+
+exit 0

File diff suppressed because it is too large
+ 6964 - 0
build-aux/ltmain.sh


+ 367 - 0
build-aux/missing

@@ -0,0 +1,367 @@
+#! /bin/sh
+# Common stub for a few missing GNU programs while installing.
+
+scriptversion=2006-05-10.23
+
+# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006
+#   Free Software Foundation, Inc.
+# Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+# 02110-1301, USA.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# At least one argument (an option or the program to wrap) is required.
+if test $# -eq 0; then
+  echo 1>&2 "Try \`$0 --help' for more information"
+  exit 1
+fi
+
+# `run' is non-empty by default and is emptied when --run is given; the
+# later `test -z "$run"' / `test -n "$run"' checks rely on that.
+run=:
+# sed programs extracting the output file name from an argument list that
+# contains `--output FILE', `--output=FILE' or `-o FILE'.
+sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p'
+sed_minuso='s/.* -o \([^ ]*\).*/\1/p'
+
+# In the cases where this matters, `missing' is being run in the
+# srcdir already.
+if test -f configure.ac; then
+  configure_ac=configure.ac
+else
+  configure_ac=configure.in
+fi
+
+# Text inserted into the warnings below; replaced when a tool exists but
+# reports a version mismatch.
+msg="missing on your system"
+
+# Handle the leading option, if any.  --run executes the wrapped command
+# directly; -h and -v print and exit; any other dash option is an error.
+case $1 in
+--run)
+  # Try to run requested program, and just exit if it succeeds.
+  run=
+  shift
+  "$@" && exit 0
+  # Exit code 63 means version mismatch.  This often happens
+  # when the user tries to use an ancient version of a tool on
+  # a file that requires a minimum version.  In this case
+  # we should proceed as if the program had been absent, or
+  # if --run hadn't been passed.
+  if test $? = 63; then
+    run=:
+    msg="probably too old"
+  fi
+  ;;
+
+  -h|--h|--he|--hel|--help)
+    echo "\
+$0 [OPTION]... PROGRAM [ARGUMENT]...
+
+Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
+error status if there is no known handling for PROGRAM.
+
+Options:
+  -h, --help      display this help and exit
+  -v, --version   output version information and exit
+  --run           try to run the given command, and emulate it if it fails
+
+Supported PROGRAM values:
+  aclocal      touch file \`aclocal.m4'
+  autoconf     touch file \`configure'
+  autoheader   touch file \`config.h.in'
+  autom4te     touch the output file, or create a stub one
+  automake     touch all \`Makefile.in' files
+  bison        create \`y.tab.[ch]', if possible, from existing .[ch]
+  flex         create \`lex.yy.c', if possible, from existing .c
+  help2man     touch the output file
+  lex          create \`lex.yy.c', if possible, from existing .c
+  makeinfo     touch the output file
+  tar          try tar, gnutar, gtar, then tar without non-portable flags
+  yacc         create \`y.tab.[ch]', if possible, from existing .[ch]
+
+Send bug reports to <bug-automake@gnu.org>."
+    exit $?
+    ;;
+
+  -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
+    echo "missing $scriptversion (GNU Automake)"
+    exit $?
+    ;;
+
+  -*)
+    echo 1>&2 "$0: Unknown \`$1' option"
+    echo 1>&2 "Try \`$0 --help' for more information"
+    exit 1
+    ;;
+
+esac
+
+# Now exit if we have it, but it failed.  Also exit now if we
+# don't have it and --version was passed (most likely to detect
+# the program).
+case $1 in
+  lex|yacc)
+    # Not GNU programs, they don't have --version.
+    ;;
+
+  tar)
+    # The tar fallback further down is only reachable in --run mode
+    # (where `run' is empty); a bare --version/--help probe fails.
+    if test -n "$run"; then
+       echo 1>&2 "ERROR: \`tar' requires --run"
+       exit 1
+    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
+       exit 1
+    fi
+    ;;
+
+  *)
+    # In --run mode the command has already failed once; if the program
+    # nevertheless answers --version it exists, so report the failure.
+    if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
+       # We have it, but it failed.
+       exit 1
+    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
+       # Could not run --version or --help.  This is probably someone
+       # running `$TOOL --version' or `$TOOL --help' to check whether
+       # $TOOL exists and not knowing $TOOL uses missing.
+       exit 1
+    fi
+    ;;
+esac
+
+# If it does not exist, or fails to run (possibly an outdated version),
+# try to emulate it.
+case $1 in
+  # aclocal: warn, then only refresh the timestamp of aclocal.m4.
+  aclocal*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`acinclude.m4' or \`${configure_ac}'.  You might want
+         to install the \`Automake' and \`Perl' packages.  Grab them from
+         any GNU archive site."
+    touch aclocal.m4
+    ;;
+
+  # autoconf: warn, then only refresh the timestamp of configure.
+  autoconf)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`${configure_ac}'.  You might want to install the
+         \`Autoconf' and \`GNU m4' packages.  Grab them from any GNU
+         archive site."
+    touch configure
+    ;;
+
+  # autoheader: touch the header templates.  The header names are read from
+  # the AC_CONFIG_HEADER/AM_CONFIG_HEADER call in ${configure_ac} (config.h
+  # when none is found).  For an `out:in1:in2' entry the first input
+  # template is touched, otherwise `<name>.in'.
+  autoheader)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`acconfig.h' or \`${configure_ac}'.  You might want
+         to install the \`Autoconf' and \`GNU m4' packages.  Grab them
+         from any GNU archive site."
+    files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
+    test -z "$files" && files="config.h"
+    touch_files=
+    for f in $files; do
+      case $f in
+      *:*) touch_files="$touch_files "`echo "$f" |
+				       sed -e 's/^[^:]*://' -e 's/:.*//'`;;
+      *) touch_files="$touch_files $f.in";;
+      esac
+    done
+    touch $touch_files
+    ;;
+
+  # automake: touch the Makefile.in matching every Makefile.am found below
+  # the current directory.
+  automake*)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
+         You might want to install the \`Automake' and \`Perl' packages.
+         Grab them from any GNU archive site."
+    find . -type f -name Makefile.am -print |
+	   sed 's/\.am$/.in/' |
+	   while read f; do touch "$f"; done
+    ;;
+
+  # autom4te: if the requested output file already exists just refresh its
+  # timestamp; otherwise write a stub script in its place (or to stdout when
+  # no output file was named) and fail.
+  autom4te)
+    echo 1>&2 "\
+WARNING: \`$1' is needed, but is $msg.
+         You might have modified some files without having the
+         proper tools for further handling them.
+         You can get \`$1' as part of \`Autoconf' from any GNU
+         archive site."
+
+    # The output file is taken from --output/--output=, then from -o.
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -f "$file"; then
+	touch "$file"
+    else
+	# Quote the name so that unusual characters survive, and redirect
+	# only when a name was actually found.
+	test -z "$file" || exec >"$file"
+	echo "#! /bin/sh"
+	echo "# Created by GNU Automake missing as a replacement of"
+	echo "#  $ $@"
+	echo "exit 0"
+	# Without an output file there is nothing to make executable.
+	test -z "$file" || chmod +x "$file"
+	exit 1
+    fi
+    ;;
+
+  # bison/yacc: drop any stale y.tab.[ch]; when the last argument is a
+  # `.y' file and a matching pre-generated .c/.h exists, copy it into place;
+  # otherwise fall back to stub files.
+  bison|yacc)
+    echo 1>&2 "\
+WARNING: \`$1' $msg.  You should only need it if
+         you modified a \`.y' file.  You may need the \`Bison' package
+         in order for those modifications to take effect.  You can get
+         \`Bison' from any GNU archive site."
+    rm -f y.tab.c y.tab.h
+    if test $# -ne 1; then
+        # LASTARG is the last positional parameter.
+        eval LASTARG="\${$#}"
+	case $LASTARG in
+	*.y)
+	    SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" y.tab.c
+	    fi
+	    SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" y.tab.h
+	    fi
+	  ;;
+	esac
+    fi
+    if test ! -f y.tab.h; then
+	echo >y.tab.h
+    fi
+    if test ! -f y.tab.c; then
+	echo 'main() { return 0; }' >y.tab.c
+    fi
+    ;;
+
+  # lex/flex: same approach as above for lex.yy.c, keyed on a trailing
+  # `.l' argument.
+  lex|flex)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified a \`.l' file.  You may need the \`Flex' package
+         in order for those modifications to take effect.  You can get
+         \`Flex' from any GNU archive site."
+    rm -f lex.yy.c
+    if test $# -ne 1; then
+        eval LASTARG="\${$#}"
+	case $LASTARG in
+	*.l)
+	    SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
+	    if test -f "$SRCFILE"; then
+	         cp "$SRCFILE" lex.yy.c
+	    fi
+	  ;;
+	esac
+    fi
+    if test ! -f lex.yy.c; then
+	echo 'main() { return 0; }' >lex.yy.c
+    fi
+    ;;
+
+  # help2man: refresh the timestamp of an existing manual page; otherwise
+  # write a one-line placeholder page (to stdout when no output file was
+  # named) and fail.
+  help2man)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+	 you modified a dependency of a manual page.  You may need the
+	 \`Help2man' package in order for those modifications to take
+	 effect.  You can get \`Help2man' from any GNU archive site."
+
+    # The output file is taken from --output/--output=, then from -o.
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -f "$file"; then
+	touch "$file"
+    else
+	# Quoted so that a name with unusual characters is used verbatim.
+	test -z "$file" || exec >"$file"
+	echo ".ab help2man is required to generate this page"
+	exit 1
+    fi
+    ;;
+
+  # makeinfo: work out which info file would have been produced and refresh
+  # its timestamp; fail when that file does not exist yet.
+  makeinfo)
+    echo 1>&2 "\
+WARNING: \`$1' is $msg.  You should only need it if
+         you modified a \`.texi' or \`.texinfo' file, or any other file
+         indirectly affecting the aspect of the manual.  The spurious
+         call might also be the consequence of using a buggy \`make' (AIX,
+         DU, IRIX).  You might want to install the \`Texinfo' package or
+         the \`GNU make' package.  Grab either from any GNU archive site."
+    # The file to touch is that specified with -o ...
+    file=`echo "$*" | sed -n "$sed_output"`
+    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
+    if test -z "$file"; then
+      # ... or it is the one specified with @setfilename ...
+      # (the input file is taken to be the last word of the command line)
+      infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
+      file=`sed -n '
+	/^@setfilename/{
+	  s/.* \([^ ]*\) *$/\1/
+	  p
+	  q
+	}' "$infile"`
+      # ... or it is derived from the source name (dir/f.texi becomes f.info)
+      test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
+    fi
+    # If the file does not exist, the user really needs makeinfo;
+    # let's fail without touching anything.
+    test -f "$file" || exit 1
+    touch "$file"
+    ;;
+
+  # tar: try GNU tar under its alternative names, then retry plain tar with
+  # the `o' and then the `h' letter removed from the first argument.
+  tar)
+    shift
+
+    # We have already tried tar in the generic part.
+    # Look for gnutar/gtar before invocation to avoid ugly error
+    # messages.
+    if (gnutar --version > /dev/null 2>&1); then
+       gnutar "$@" && exit 0
+    fi
+    if (gtar --version > /dev/null 2>&1); then
+       gtar "$@" && exit 0
+    fi
+    firstarg="$1"
+    if shift; then
+	case $firstarg in
+	*o*)
+	    firstarg=`echo "$firstarg" | sed s/o//`
+	    tar "$firstarg" "$@" && exit 0
+	    ;;
+	esac
+	# At this point firstarg may already have lost its `o'.
+	case $firstarg in
+	*h*)
+	    firstarg=`echo "$firstarg" | sed s/h//`
+	    tar "$firstarg" "$@" && exit 0
+	    ;;
+	esac
+    fi
+
+    echo 1>&2 "\
+WARNING: I can't seem to be able to run \`tar' with the given arguments.
+         You may want to install GNU tar or Free paxutils, or check the
+         command line arguments."
+    exit 1
+    ;;
+
+  # Any other program: nothing can be emulated, so warn and fail.
+  *)
+    echo 1>&2 "\
+WARNING: \`$1' is needed, and is $msg.
+         You might have modified some files without having the
+         proper tools for further handling them.  Check the \`README' file,
+         it often tells you about the needed prerequisites for installing
+         this package.  You may also peek at any GNU archive site, in case
+         some other package would contain this missing \`$1' program."
+    exit 1
+    ;;
+esac
+
+exit 0
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-end: "$"
+# End:

+ 444 - 0
configure.ac

@@ -0,0 +1,444 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Package identity, helper-script directory and Automake options.
+AC_INIT([StarPU],0.1, [http://runtime.bordeaux.inria.fr/StarPU/], starpu)
+AC_CONFIG_AUX_DIR([build-aux])
+AM_INIT_AUTOMAKE([-Wall -Werror foreign])
+AC_MSG_NOTICE([Configuring StarPU])
+
+# Sanity check: configure must be run against a tree containing this file.
+AC_CONFIG_SRCDIR(include/starpu.h)
+
+AC_PREREQ(2.60)
+
+
+# Compilers and basic tools.
+AC_PROG_CC
+AC_PROG_CPP
+AC_PROG_SED
+AC_PROG_LN_S
+AC_PROG_F77
+
+AC_PROG_LIBTOOL
+
+AC_PROG_INSTALL
+AC_PROG_MKDIR_P
+
+AC_HEADER_STDC
+
+# This will be useful for program which use CUDA (and .cubin files) which need
+# some path to the CUDA code at runtime.
+AC_DEFINE_UNQUOTED(STARPUDIR, "$PWD", [location of StarPU sources])
+AC_SUBST(STARPUDIR, $PWD)
+
+# Mandatory system libraries.
+AC_SEARCH_LIBS([pthread_create],[pthread],,AC_MSG_ERROR([pthread library unavailable]))
+AC_SEARCH_LIBS([sqrt],[m],,AC_MSG_ERROR([math library unavailable]))
+
+###############################################################################
+#                                                                             #
+#                                 CPUs settings                               #
+#                                                                             #
+###############################################################################
+
+# CPU workers are enabled unless --disable-cpu is given.  The choice is
+# exported as the USE_CPU substitution and Automake conditional.
+AC_MSG_CHECKING(whether CPUs should be used)
+AC_ARG_ENABLE(cpu, [AS_HELP_STRING([--disable-cpu],
+			[do not use the CPU(s)])],
+			enable_cpu=$enableval, enable_cpu=yes)
+AC_MSG_RESULT($enable_cpu)
+AC_SUBST(USE_CPU, $enable_cpu)
+AM_CONDITIONAL(USE_CPU, test x$enable_cpu = xyes)
+
+if test x$enable_cpu = xyes; then
+	# NOTE(review): the preprocessor symbol is USE_CPUS while the
+	# substitution and conditional above are named USE_CPU - confirm
+	# this is intended.
+	AC_DEFINE(USE_CPUS, [1], [CPU driver is activated])
+
+	# This value is set quite randomly, but StarPU should not take more
+	# core than there are in the system
+	AC_DEFINE(NMAXCORES, [16], [Maximum number of CPUs supported])
+fi
+
+###############################################################################
+#                                                                             #
+#                                 CUDA settings                               #
+#                                                                             #
+###############################################################################
+
+# CUDA support is tri-state: with --enable-cuda a missing CUDA setup is a
+# fatal error, with --disable-cuda detection is skipped entirely, and by
+# default (maybe) CUDA is enabled only when headers and libraries are found.
+AC_ARG_ENABLE(cuda, [AS_HELP_STRING([--disable-cuda],
+		[do not use CUDA device(s)])],, [enable_cuda=maybe])
+
+if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
+	#AC_MSG_CHECKING(whether CUDA is available)
+	AC_ARG_WITH(cuda-dir,
+		[AS_HELP_STRING([--with-cuda-dir=<path>],
+		[specify CUDA installation directory (default is /usr/local/cuda/)])],
+		[
+			cuda_dir=$withval
+			# in case this was not explicit yet
+			enable_cuda=yes
+		], cuda_dir=/usr/local/cuda/)
+
+	# only add the search paths that actually exist
+	if test -d "$cuda_dir/include/"; then
+		CPPFLAGS="${CPPFLAGS} -I$cuda_dir/include/ "
+	fi
+	if test -d "$cuda_dir/lib/"; then
+		LDFLAGS="${LDFLAGS} -L$cuda_dir/lib/ "
+	fi
+
+	# do we have a valid CUDA setup ?
+	have_valid_cuda=yes
+	AC_CHECK_HEADER([cuda.h],,[have_valid_cuda=no])
+	AC_SEARCH_LIBS([cuInit],[cuda],,[have_valid_cuda=no])
+
+	# we also check that CUBLAS is available
+	AC_SEARCH_LIBS([cublasInit],[cublas],,[have_valid_cuda=no])
+
+	# in case CUDA was explicitly required, but is not available, this is an error
+	# (both operands carry the x prefix, otherwise the comparison never matches)
+	if test x$enable_cuda = xyes -a x$have_valid_cuda = xno; then
+		AC_MSG_ERROR([cannot find CUDA])
+	fi
+
+	# now we enable CUDA if and only if a proper setup is available
+	enable_cuda=$have_valid_cuda
+fi
+
+AC_MSG_CHECKING(whether CUDA should be used)
+AC_MSG_RESULT($enable_cuda)
+AC_SUBST(USE_CUDA, $enable_cuda)
+AM_CONDITIONAL(USE_CUDA, test x$enable_cuda = xyes)
+if test x$enable_cuda = xyes; then
+	AC_DEFINE(USE_CUDA, [1], [CUDA support is activated])
+
+	#in case this is a 64bit setup, we tell nvcc to use a -m64 flag
+	# AC_CHECK_SIZEOF stores its result in the ac_cv_sizeof_void_p shell
+	# variable; SIZEOF_VOID_P only exists as a preprocessor symbol.
+	AC_CHECK_SIZEOF([void *])
+	if test x$ac_cv_sizeof_void_p = x8; then
+		NVCCFLAGS="${NVCCFLAGS} -m64"
+		AC_SUBST(NVCCFLAGS)
+	fi
+fi
+
+###############################################################################
+#                                                                             #
+#                                 Cell settings                               #
+#                                                                             #
+###############################################################################
+
+# Cell SPU support (the GORDON driver) is off unless --enable-gordon is
+# given; when enabled, libspe2 is mandatory.
+AC_MSG_CHECKING(whether GORDON should be used)
+AC_ARG_ENABLE(gordon, [AS_HELP_STRING([--enable-gordon],
+			[use Cell's SPUs])],
+			enable_gordon=$enableval, enable_gordon=no)
+AC_MSG_RESULT($enable_gordon)
+AC_SUBST(USE_GORDON, $enable_gordon)
+AM_CONDITIONAL(USE_GORDON, test x$enable_gordon = xyes)
+
+if test x$enable_gordon = xyes; then
+	AC_CHECK_LIB(spe2, spe_context_create,,AC_MSG_ERROR([cannot find libspe2]))
+	AC_DEFINE(USE_GORDON, [1], [Cell support is enabled])
+fi
+
+###############################################################################
+#                                                                             #
+#                   Debug and Performance analysis tools                      #
+#                                                                             #
+###############################################################################
+
+# --enable-verbose defines VERBOSE in the generated config header.
+AC_MSG_CHECKING(whether debug messages should be displayed)
+AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
+			[display verbose debug messages])],
+			enable_verbose=$enableval, enable_verbose=no)
+AC_MSG_RESULT($enable_verbose)
+if test x$enable_verbose = xyes; then
+	AC_DEFINE(VERBOSE, [1], [display verbose debug messages])
+fi
+
+
+# --enable-coverage appends --coverage to both the preprocessor and the
+# linker flags and exports the choice as the COVERAGE substitution.
+AC_MSG_CHECKING(whether coverage testing should be enabled)
+AC_ARG_ENABLE(coverage, [AS_HELP_STRING([--enable-coverage],
+			[enable coverage checking])],
+			enable_coverage=$enableval, enable_coverage=no)
+AC_MSG_RESULT($enable_coverage)
+AC_SUBST(COVERAGE, $enable_coverage)
+if test x$enable_coverage = xyes; then
+	CPPFLAGS="${CPPFLAGS} --coverage"
+	LDFLAGS="${LDFLAGS} --coverage"
+fi
+
+
+# shall we use FxT to generate trace of the execution ?
+# --with-fxt uses the FxT found on the default search paths,
+# --with-fxt=<dir> uses <dir>/include and <dir>/lib, and without the option
+# (or with --without-fxt) tracing is disabled.
+AC_MSG_CHECKING(whether fxt traces should be generated)
+AC_ARG_WITH(fxt, [AS_HELP_STRING([--with-fxt[=<dir>]], [generate fxt traces])],
+	[
+		if test x$withval != xno; then
+			use_fxt=yes
+			if test x$withval = xyes; then
+				AC_MSG_RESULT(yes)
+				use_fxt_from_system=yes
+			else
+				# use specified path
+				# TODO check if the dir is actually containing FxT
+				use_fxt_from_system=no
+				fxtdir=$withval
+				AC_MSG_RESULT(yes using $fxtdir)
+				AC_SUBST(FXTDIR, $fxtdir)
+			fi
+		else
+			use_fxt=no
+			AC_MSG_RESULT(no)
+		fi
+	],
+	[	
+		use_fxt=no
+		AC_MSG_RESULT(no)
+	])
+AC_SUBST(USE_FXT, $use_fxt)
+AM_CONDITIONAL(USE_FXT, test x$use_fxt = xyes)
+
+if test x$use_fxt = xyes; then
+	AC_DEFINE(USE_FXT, [1], [enable FxT traces])
+	AC_DEFINE(CONFIG_FUT, [1], [enable FUT traces])
+
+	# NOTE(review): USE_FXTDIR_FROM_USER receives use_fxt_from_system, so
+	# it is yes when FxT comes from the system and no when a directory
+	# was given - the name suggests the opposite; confirm against the
+	# users of this substitution.
+	AC_SUBST(USE_FXTDIR_FROM_USER,$use_fxt_from_system)
+	if test x$use_fxt_from_system = xno; then
+		CPPFLAGS="${CPPFLAGS} -I$fxtdir/include/ "
+		LDFLAGS="${LDFLAGS} -L$fxtdir/lib/ "
+	fi
+
+	# library and headers are mandatory once FxT has been requested
+	AC_CHECK_LIB(fxt, fut_setup,,AC_MSG_ERROR([cannot find fxt lib]))
+	AC_CHECK_HEADER([fxt/fxt.h],,AC_MSG_ERROR([cannot find headers for fxt]))
+	AC_CHECK_HEADER([fxt/fut.h],,AC_MSG_ERROR([cannot find headers for fxt]))
+
+	# In case FxT traces are generated, we may use our (poor) hand-made gtk
+	# tool to visualize traces
+
+	# GTK is optional: its absence only leaves USE_GTK undefined
+	PKG_PROG_PKG_CONFIG
+	PKG_CHECK_MODULES([GTK], [gtk+-2.0], enable_gtk=yes, enable_gtk=no) 
+	AC_SUBST(USE_GTK, $enable_gtk)
+	if test x$enable_gtk = xyes; then
+		AC_SUBST(GTK_CFLAGS)
+		AC_SUBST(GTK_LIBS)
+		AC_DEFINE(USE_GTK, [1], [enable GTK])
+	fi
+fi
+
+# --enable-perf-debug defines PERF_DEBUG and adds the -pg profiling flag to
+# the preprocessor and linker flags.
+AC_MSG_CHECKING(whether performance debugging should be enabled)
+AC_ARG_ENABLE(perf-debug, [AS_HELP_STRING([--enable-perf-debug],
+			[enable performance debugging])],
+			enable_perf_debug=$enableval, enable_perf_debug=no)
+AC_MSG_RESULT($enable_perf_debug)
+AC_SUBST(PERF_DEBUG, $enable_perf_debug)
+# the variable tested here must be the one set by the option above
+if test x$enable_perf_debug = xyes; then
+	AC_DEFINE(PERF_DEBUG, [1], [enable performance debug])
+	CPPFLAGS="${CPPFLAGS} -pg "
+	LDFLAGS="${LDFLAGS} -pg "
+fi
+
+# --enable-model-debug defines MODEL_DEBUG in the generated config header.
+AC_MSG_CHECKING(whether performance model debugging should be enabled)
+AC_ARG_ENABLE(model-debug, [AS_HELP_STRING([--enable-model-debug],
+			[enable performance model debugging])],
+			enable_model_debug=$enableval, enable_model_debug=no)
+AC_MSG_RESULT($enable_model_debug)
+if  test x$enable_model_debug = xyes; then
+	AC_DEFINE(MODEL_DEBUG, [1], [enable performance model debug])
+fi
+
+# --enable-stats is exported under two substitution names (STATS and
+# DATA_STATS) but only defines the DATA_STATS preprocessor symbol.
+AC_MSG_CHECKING(whether statistics should be generated)
+AC_ARG_ENABLE(stats, [AS_HELP_STRING([--enable-stats],
+			[enable statistics])],
+			enable_stats=$enableval, enable_stats=no)
+AC_MSG_RESULT($enable_stats)
+AC_SUBST(STATS, $enable_stats)
+AC_SUBST(DATA_STATS, $enable_stats)
+
+if test x$enable_stats = xyes; then
+        AC_DEFINE(DATA_STATS, [1], [enable statistics])
+fi
+
+
+###############################################################################
+#                                                                             #
+#                  Miscellaneous options for StarPU                           #
+#                                                                             #
+###############################################################################
+
+# --enable-dynamic only sets the DYNAMIC substitution here; nothing else in
+# this file reacts to it.
+AC_MSG_CHECKING(whether a dynamic library should be generated)
+AC_ARG_ENABLE(dynamic, [AS_HELP_STRING([--enable-dynamic],
+			[generate a dynamic library])],
+			enable_dynamic=$enableval, enable_dynamic=no)
+AC_MSG_RESULT($enable_dynamic)
+AC_SUBST(DYNAMIC, $enable_dynamic)
+
+# Priorities are on by default; --disable-priority defines NO_PRIO.
+AC_MSG_CHECKING(whether priorities should be enabled)
+AC_ARG_ENABLE(priority, [AS_HELP_STRING([--disable-priority],
+			[do not use priorities])],
+			enable_priority=$enableval, enable_priority=yes)
+AC_MSG_RESULT($enable_priority)
+if test x$enable_priority = xno; then
+	AC_DEFINE(NO_PRIO, [1], [Disable priorities])
+fi
+
+# Data RW-locks are on by default; --disable-data-rw-lock defines
+# NO_DATA_RW_LOCK.
+AC_MSG_CHECKING(whether data RW-lock should be used)
+AC_ARG_ENABLE(data-rw-lock, [AS_HELP_STRING([--disable-data-rw-lock],
+			[do not use data RW-locks])],
+			enable_data_rw_lock=$enableval, enable_data_rw_lock=yes)
+AC_MSG_RESULT($enable_data_rw_lock)
+if test x$enable_data_rw_lock = xno; then
+	AC_DEFINE(NO_DATA_RW_LOCK, [1], [data RW-lock are disabled])
+fi
+
+
+# The allocation cache is off by default; --enable-allocation-cache defines
+# USE_ALLOCATION_CACHE.
+AC_MSG_CHECKING(whether allocation cache should be used)
+AC_ARG_ENABLE(allocation-cache, [AS_HELP_STRING([--enable-allocation-cache],
+			[enable data allocation cache])],
+			enable_allocation_cache=$enableval, enable_allocation_cache=no)
+AC_MSG_RESULT($enable_allocation_cache)
+if test x$enable_allocation_cache = xyes; then
+	AC_DEFINE(USE_ALLOCATION_CACHE, [1], [enable data allocation cache])
+fi
+
+# by default, we put the performance models in $PWD/.sampling/
+# This is a plain assignment: the parenthesised form is a bash array and a
+# syntax error for a POSIX shell.
+perf_model_dir=$PWD/.sampling/
+AC_ARG_WITH(perf-model-dir, [AS_HELP_STRING([--with-perf-model-dir=<dir>], [specify where performance models should be stored])],
+	[
+		# yes and no carry no directory name: keep the default then
+		if test "x$withval" != xno && test "x$withval" != xyes; then
+			perf_model_dir=$withval
+		fi
+	]
+	)
+AC_MSG_CHECKING(performance models location)
+AC_MSG_RESULT($perf_model_dir)
+AC_DEFINE_UNQUOTED(PERF_MODEL_DIR, "$perf_model_dir", [performance models location])
+
+###############################################################################
+#                                                                             #
+#                                  Examples                                   #
+#                                                                             #
+###############################################################################
+
+# OpenGL rendering of some examples is off by default; when requested,
+# glut, GL and GLU are all mandatory.
+AC_ARG_ENABLE(opengl-render, [AS_HELP_STRING([--enable-opengl-render],
+			[enable OpenGL rendering of some examples])],
+			enable_opengl_render=$enableval, enable_opengl_render=no)
+
+if test x$enable_opengl_render = xyes; then
+	AC_CHECK_LIB(glut, glutInit,,AC_MSG_ERROR([cannot find glut]))
+	AC_CHECK_LIB(GL, glXCreateContext,,AC_MSG_ERROR([cannot find GL]))
+	AC_CHECK_LIB(GLU, gluLookAt,,AC_MSG_ERROR([cannot find GLU]))
+	
+	AC_DEFINE(OPENGL_RENDER, [1], [enable OpenGL rendering of some examples])
+fi
+
+# The result is reported after the library checks so that their own output
+# does not end up between the question and its answer.
+AC_MSG_CHECKING(whether OpenGL rendering is enabled)
+AC_SUBST(OPENGL_RENDER, $enable_opengl_render)
+AC_MSG_RESULT($enable_opengl_render)
+
+# In case there are BLAS kernels that are used by the example applications
+# we may specify which library to use. Note that this is not used for StarPU
+# itself.
+#
+# blas_lib starts as maybe, which lets the sections below auto-detect a
+# library; --enable-blas-lib=<name> pins the choice and --disable-blas-lib
+# is the same as --enable-blas-lib=none.
+
+blas_lib=maybe
+AC_ARG_ENABLE(blas-lib,
+ [  --enable-blas-lib[=blaslibname]:
+                      none [default]: no BLAS lib is used
+                      atlas: use ATLAS library
+                      goto: use GOTO library],
+ [ 
+     if   test "x$enableval" = "xatlas" ; then
+        blas_lib=atlas
+     elif test "x$enableval" = "xgoto" ; then
+        blas_lib=goto
+     elif test "x$enableval" = "xnone" ; then
+        blas_lib=none
+     elif test "x$enableval" = "xno" ; then
+        blas_lib=none
+     else
+        # report through the standard macro: the message goes to stderr
+        # and config.log and configure exits with a portable status
+        AC_MSG_ERROR([unknown BLAS library: $enableval])
+     fi
+ ])
+
+# GOTO BLAS: considered when explicitly selected or when no choice was
+# made yet; giving --with-goto-dir selects it and adds the directory to the
+# header and library search paths.
+if test x$blas_lib = xmaybe -o x$blas_lib = xgoto; then
+AC_ARG_WITH(goto-dir, [AS_HELP_STRING([--with-goto-dir=<dir>], [specify GOTO lib location])],
+	[
+		blas_lib=goto
+		gotodir=$withval
+		AC_SUBST(GOTODIR, $gotodir)
+
+		CPPFLAGS="${CPPFLAGS} -I$gotodir/ "
+		LDFLAGS="${LDFLAGS} -L$gotodir/ "
+	]
+	)
+
+if test x$blas_lib = xgoto; then
+# test whether GOTO is actually available
+AC_CHECK_LIB(goto, sgemm_,,AC_MSG_ERROR([cannot find goto lib]))
+AC_DEFINE(GOTO, [1], [use GOTO library])
+fi
+
+fi
+     
+# ATLAS: considered when explicitly selected or when nothing has been
+# chosen so far; giving --with-atlas-dir selects it and adds the include
+# and lib subdirectories to the search paths.
+if test x$blas_lib = xmaybe -o x$blas_lib = xatlas; then
+AC_ARG_WITH(atlas-dir, [AS_HELP_STRING([--with-atlas-dir=<dir>], [specify ATLAS lib location])],
+	[
+		AC_MSG_CHECKING(ATLAS location)
+		blas_lib=atlas
+		atlasdir=$withval
+		AC_MSG_RESULT($atlasdir)
+		AC_SUBST(ATLASDIR, $atlasdir)
+
+		CPPFLAGS="${CPPFLAGS} -I$atlasdir/include/ "
+		LDFLAGS="${LDFLAGS} -L$atlasdir/lib/ "
+	]
+	)
+
+if test x$blas_lib = xatlas; then
+# test whether ATLAS is actually available
+AC_CHECK_HEADER([cblas.h],,AC_MSG_ERROR([cannot find atlas headers]))
+AC_CHECK_LIB(atlas, ATL_sgemm,,AC_MSG_ERROR([cannot find atlas lib]),)
+# the cblas link test is given -latlas as an additional library
+AC_CHECK_LIB(cblas, cblas_sgemm,,AC_MSG_ERROR([cannot find atlas lib]),[-latlas])
+AC_DEFINE(ATLAS, [1], [use ATLAS library])
+fi
+
+fi
+ 
+# Last resort when neither GOTO nor ATLAS was selected: look for sgemm_ in
+# a system libblas, and settle on none when that fails too.
+if test x$blas_lib = xmaybe; then
+     #perhaps it is possible to use some BLAS lib from the system
+     use_system_blas=no
+     AC_SEARCH_LIBS([sgemm_],[blas],use_system_blas=yes,,)
+     if test x$use_system_blas = xyes; then
+        AC_DEFINE(SYSTEM_BLAS, [1], [use refblas library])
+	blas_lib=system
+     else
+	blas_lib=none
+     fi
+fi
+
+# Exactly one of these conditionals is true, depending on the final value
+# of blas_lib.
+AM_CONDITIONAL(ATLAS_BLAS_LIB, test x$blas_lib = xatlas)
+AM_CONDITIONAL(GOTO_BLAS_LIB, test x$blas_lib = xgoto)
+AM_CONDITIONAL(SYSTEM_BLAS_LIB, test x$blas_lib = xsystem)
+AM_CONDITIONAL(NO_BLAS_LIB, test x$blas_lib = xnone)
+
+AC_MSG_CHECKING(which BLAS lib should be used)
+AC_MSG_RESULT($blas_lib)
+AC_SUBST(BLAS_LIB,$blas_lib)
+
+# Generated configuration headers and output files.
+AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
+
+AC_OUTPUT([
+	Makefile
+	src/Makefile
+	tools/Makefile
+	libstarpu.pc
+	examples/Makefile
+	tests/Makefile
+])

+ 214 - 0
examples/Makefile.am

@@ -0,0 +1,214 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Every example links against the libtool library built in src/ plus the
+# libraries found by configure, and sees the public headers and examples/.
+LIBS = $(top_builddir)/src/libstarpu.la @LIBS@
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/
+
+# The CUDA sources are always distributed, even when CUDA is disabled.
+EXTRA_DIST = 					\
+	cuda/incrementer_cuda.cu		\
+	cuda/spmv_cuda.cu
+
+CLEANFILES = 					\
+	incrementer_cuda.linkinfo		\
+	cuda/incrementer_cuda.cubin		\
+	spmv_cuda.linkinfo			\
+	cuda/spmv_cuda.cubin			
+
+
+if USE_CUDA
+
+# TODO define NVCCFLAGS
+NVCC ?= nvcc
+
+# Suffix rule compiling a .cu kernel into a .cubin next to it; the target
+# directory is created first.
+.cu.cubin:
+	$(MKDIR_P) `dirname $@`
+	$(NVCC) -cubin $< -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+
+# Listed as BUILT_SOURCES so that the kernels are built before the programs.
+BUILT_SOURCES =					\
+	cuda/incrementer_cuda.cubin		\
+	cuda/spmv_cuda.cubin			
+
+endif
+
+# Starts empty; each example section below appends its programs.
+noinst_PROGRAMS =
+
+# Headers used by the examples: distributed but never installed.
+noinst_HEADERS = 				\
+	heat/lu_kernels_model.h			\
+	heat/dw_sparse_cg.h			\
+	heat/heat.h				\
+	heat/dw_factolu.h			\
+	cholesky/dw_cholesky_models.h		\
+	cholesky/dw_cholesky.h			\
+	common/blas_model.h			\
+	common/blas.h				\
+	mult/dw_mult.h				\
+	cuda/incrementer_cuda.h			\
+	fortran/bindings/StarPU-fortran.h	\
+	strassen/strassen.h			\
+	strassen/strassen_models.h		\
+	spmv/matrix-market/mmio.h		\
+	spmv/matrix-market/mm_to_bcsr.h		\
+	spmv/dw_spmv.h				\
+	spmv/dw_block_spmv.h
+
+################
+# Mult example #
+################
+
+# Built only when configure selected some BLAS library.  The three variants
+# share the BLAS wrappers and the BLAS performance model.
+if !NO_BLAS_LIB
+
+noinst_PROGRAMS += 				\
+	mult/dw_mult 				\
+	mult/dw_mult_no_stride			\
+	mult/dw_mult_no_stride_no_tag 
+
+mult_dw_mult_SOURCES = 				\
+	mult/dw_mult.c				\
+	common/blas.c				\
+	common/blas_model.c
+		
+mult_dw_mult_no_stride_SOURCES = 		\
+	mult/dw_mult_no_stride.c		\
+	common/blas.c				\
+	common/blas_model.c
+
+mult_dw_mult_no_stride_no_tag_SOURCES =		\
+	mult/dw_mult_no_stride_no_tag.c		\
+	common/blas.c				\
+	common/blas_model.c
+
+endif
+
+####################
+# Cholesky example #
+####################
+
+# Built only when configure selected some BLAS library.  Both programs use
+# the same kernels and models and differ only in their driver file.
+if !NO_BLAS_LIB
+
+noinst_PROGRAMS +=				\
+	cholesky/dw_cholesky			\
+	cholesky/dw_cholesky_no_stride
+
+cholesky_dw_cholesky_SOURCES =			\
+	cholesky/dw_cholesky.c			\
+	cholesky/dw_cholesky_models.c		\
+	cholesky/dw_cholesky_kernels.c		\
+	common/blas.c
+
+cholesky_dw_cholesky_no_stride_SOURCES =	\
+	cholesky/dw_cholesky_no_stride.c	\
+	cholesky/dw_cholesky_models.c		\
+	cholesky/dw_cholesky_kernels.c		\
+	common/blas.c
+
+endif
+
+################
+# Heat example #
+################
+
+# Built only when configure selected some BLAS library.  A single program
+# bundling the LU factorisation, sparse CG and display sources.
+if !NO_BLAS_LIB
+
+noinst_PROGRAMS += heat/heat
+
+heat_heat_SOURCES =				\
+	heat/heat.c				\
+	heat/dw_factolu.c			\
+	heat/dw_factolu_tag.c			\
+	heat/dw_sparse_cg.c			\
+	heat/heat_display.c			\
+	heat/lu_kernels_model.c			\
+	heat/dw_sparse_cg_kernels.c		\
+	heat/dw_factolu_kernels.c		\
+	common/blas.c
+
+endif
+
+################
+# Tag examples #
+################
+
+# Always built: these two examples do not depend on any BLAS conditional.
+noinst_PROGRAMS +=				\
+	tag_example/tag_example			\
+	tag_example/tag_example2
+
+tag_example_tag_example_SOURCES =		\
+	tag_example/tag_example.c
+
+tag_example_tag_example2_SOURCES =		\
+	tag_example/tag_example2.c
+
+####################
+# Strassen example #
+####################
+
+# Built only when ATLAS is the selected BLAS library.
+if ATLAS_BLAS_LIB 
+
+noinst_PROGRAMS += strassen/dw_strassen
+
+strassen_dw_strassen_SOURCES = 			\
+	strassen/strassen.c			\
+	strassen/strassen_kernels.c		\
+	strassen/test_strassen.c		\
+	strassen/strassen_models.c		\
+	common/blas.c
+
+endif
+
+#####################
+# Strassen2 example #
+#####################
+
+# Built with any BLAS library, unlike the first Strassen example above.
+if !NO_BLAS_LIB 
+
+noinst_PROGRAMS += strassen2/strassen
+
+strassen2_strassen_SOURCES = 			\
+	strassen2/strassen2.c			\
+	strassen2/strassen2_kernels.c		\
+	common/blas.c
+
+endif
+
+################
+# SpMV example #
+################
+
+# Built only when ATLAS is the selected BLAS library.  The block variant
+# also compiles the Matrix Market reader and the BCSR conversion code.
+if ATLAS_BLAS_LIB 
+
+noinst_PROGRAMS += 				\
+	spmv/dw_spmv				\
+	spmv/dw_block_spmv
+
+spmv_dw_spmv_SOURCES = 				\
+	spmv/dw_spmv.c
+
+spmv_dw_block_spmv_SOURCES =			\
+	spmv/dw_block_spmv.c			\
+	spmv/dw_block_spmv_kernels.c		\
+	spmv/matrix-market/mm_to_bcsr.c		\
+	spmv/matrix-market/mmio.c
+
+endif
+
+#######################
+# Incrementer example #
+#######################
+
+# Always built; the matching CUDA kernel is handled by the USE_CUDA section
+# at the top of this file.
+noinst_PROGRAMS += incrementer/incrementer
+
+incrementer_incrementer_SOURCES =		\
+	incrementer/incrementer.c

+ 46 - 0
examples/cholesky/Makefile.in

@@ -0,0 +1,46 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Directory searched by pkg-config for libstarpu.pc; the @STARPUDIR@
+# placeholder is presumably substituted when this Makefile.in is processed.
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+# "$$" yields a literal "$", so pkg-config is run by the shell executing each
+# recipe rather than by make itself.
+LIBS+=$$(pkg-config --libs libstarpu)
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+# The BLAS wrapper object is expected to have been built in ../common already.
+LDFLAGS+= ../common/blas.o
+
+# "all" and "clean" never create files of those names.
+.PHONY: all clean
+
+all: dw_cholesky dw_cholesky_no_stride
+
+dw_cholesky_models.o: dw_cholesky_models.c dw_cholesky_models.h
+	$(CC) $(CFLAGS) dw_cholesky_models.c -c -o dw_cholesky_models.o
+
+dw_cholesky_kernels.o: dw_cholesky_kernels.c dw_cholesky.h
+	$(CC) $(CFLAGS) dw_cholesky_kernels.c -c -o dw_cholesky_kernels.o
+
+dw_cholesky_no_stride.o: dw_cholesky_no_stride.c dw_cholesky.h dw_cholesky_models.h
+	$(CC) $(CFLAGS) dw_cholesky_no_stride.c -c -o dw_cholesky_no_stride.o
+
+dw_cholesky.o: dw_cholesky.c dw_cholesky.h dw_cholesky_models.h
+	$(CC) $(CFLAGS) dw_cholesky.c -c -o dw_cholesky.o
+
+# Every object named on the link line is also a prerequisite, so that building
+# this target on its own (or with "make -j") cannot link before they exist.
+dw_cholesky_no_stride:  $(STARPU) dw_cholesky_no_stride.o dw_cholesky_kernels.o dw_cholesky_models.o
+	$(CC) dw_cholesky_no_stride.o dw_cholesky_kernels.o dw_cholesky_models.o -o dw_cholesky_no_stride $(LDFLAGS) $(LIBS)
+
+dw_cholesky: $(STARPU) dw_cholesky.o dw_cholesky_kernels.o dw_cholesky_models.o
+	$(CC)  dw_cholesky.o dw_cholesky_kernels.o dw_cholesky_models.o -o dw_cholesky $(LDFLAGS) $(LIBS)
+
+clean:
+	@rm -f *.o *.d gmon.out *.gcno *.gcda
+	@rm -f dw_cholesky dw_cholesky_no_stride

+ 357 - 0
examples/cholesky/dw_cholesky.c

@@ -0,0 +1,357 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_cholesky.h"
+#include "dw_cholesky_models.h"
+
+/*
+ *	Some useful functions
+ */
+
+/*
+ * Allocate a task with starpu_task_create(), attach the tag `id` to it and
+ * leave its codelet argument empty. The caller still has to set the codelet
+ * and the buffers.
+ */
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *new_task = starpu_task_create();
+
+	new_task->tag_id = id;
+	new_task->use_tag = 1;
+	new_task->cl_arg = NULL;
+
+	return new_task;
+}
+
+/* Task callback: argcb is the semaphore (sem_t *) that gets posted. */
+static void terminal_callback(void *argcb)
+{
+	sem_post((sem_t *)argcb);
+}
+
+
+/*
+ *	Create the codelets
+ */
+
+/*
+ * Codelet attached to the tasks built by create_task_11(): `.where` is ANY,
+ * it accesses one buffer and uses chol_model_11 as performance model. The
+ * CUBLAS entry point is only registered when USE_CUDA is defined.
+ */
+static starpu_codelet cl11 =
+{
+	.where = ANY,
+	.core_func = chol_core_codelet_update_u11,
+#ifdef USE_CUDA
+	.cublas_func = chol_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+/*
+ * Build (but do not submit) the task tagged TAG11(k). It runs codelet cl11 in
+ * RW mode on get_sub_data(dataA, 2, k, k) at MAX_PRIO. For k > 0 the tag
+ * depends on TAG22(k-1, k, k). When k is the last block index, the task gets
+ * terminal_callback as callback, with `sem` as its argument.
+ */
+static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned nblocks, sem_t *sem)
+{
+	const int is_last_step = (k == nblocks - 1);
+	struct starpu_task *t11 = create_task(TAG11(k));
+
+	t11->cl = &cl11;
+
+	/* the single buffer is the (k,k) sub-data, updated in place */
+	t11->buffers[0].state = get_sub_data(dataA, 2, k, k);
+	t11->buffers[0].mode = RW;
+
+	t11->priority = MAX_PRIO;
+
+	/* every step but the first waits for the previous step's (k,k) update */
+	if (k != 0)
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+
+	/* the end of the last step is reported through the semaphore */
+	if (is_last_step) {
+		t11->callback_func = terminal_callback;
+		t11->callback_arg = sem;
+	}
+
+	return t11;
+}
+
+/*
+ * Codelet attached to the tasks built by create_task_21(): `.where` is ANY,
+ * it accesses two buffers and uses chol_model_21 as performance model. The
+ * CUBLAS entry point is only registered when USE_CUDA is defined.
+ */
+static starpu_codelet cl21 =
+{
+	.where = ANY,
+	.core_func = chol_core_codelet_update_u21,
+#ifdef USE_CUDA
+	.cublas_func = chol_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+/*
+ * Build and submit the task tagged TAG21(k, j). It runs codelet cl21, reading
+ * get_sub_data(dataA, 2, k, k) and updating get_sub_data(dataA, 2, k, j).
+ * The tag always depends on TAG11(k) and, for k > 0, also on
+ * TAG22(k-1, k, j). Only the j == k+1 task gets MAX_PRIO.
+ */
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
+{
+	struct starpu_task *t21 = create_task(TAG21(k, j));
+
+	t21->cl = &cl21;
+
+	/* buffer 0: read-only (k,k) sub-data, buffer 1: (k,j) sub-data in RW */
+	t21->buffers[0].state = get_sub_data(dataA, 2, k, k);
+	t21->buffers[0].mode = R;
+	t21->buffers[1].state = get_sub_data(dataA, 2, k, j);
+	t21->buffers[1].mode = RW;
+
+	if (j == k+1)
+		t21->priority = MAX_PRIO;
+
+	/* the first step has no previous TAG22 to wait for */
+	if (k == 0)
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	else
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+
+	starpu_submit_task(t21);
+}
+
+/*
+ * Codelet attached to the tasks built by create_task_22(): `.where` is ANY,
+ * it accesses three buffers and uses chol_model_22 as performance model. The
+ * CUBLAS entry point is only registered when USE_CUDA is defined.
+ */
+static starpu_codelet cl22 =
+{
+	.where = ANY,
+	.core_func = chol_core_codelet_update_u22,
+#ifdef USE_CUDA
+	.cublas_func = chol_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+/*
+ * Build and submit the task tagged TAG22(k, i, j). It runs codelet cl22,
+ * reading the (k,i) and (k,j) sub-data of dataA and updating the (i,j) one.
+ * The tag depends on TAG21(k, i) and TAG21(k, j) and, for k > 0, also on
+ * TAG22(k-1, i, j). Only the i == j == k+1 task gets MAX_PRIO.
+ */
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
+{
+	struct starpu_task *t22 = create_task(TAG22(k, i, j));
+
+	t22->cl = &cl22;
+
+	/* two read-only inputs followed by the block that is updated */
+	t22->buffers[0].state = get_sub_data(dataA, 2, k, i);
+	t22->buffers[0].mode = R;
+	t22->buffers[1].state = get_sub_data(dataA, 2, k, j);
+	t22->buffers[1].mode = R;
+	t22->buffers[2].state = get_sub_data(dataA, 2, i, j);
+	t22->buffers[2].mode = RW;
+
+	if (i == k + 1 && j == k + 1)
+		t22->priority = MAX_PRIO;
+
+	/* the first step has no previous update of the same block to wait for */
+	if (k == 0)
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+	else
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+
+	starpu_submit_task(t22);
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+/*
+ * Declare every task of the factorization of dataA (split in nblocks x
+ * nblocks sub-data), run them and report the elapsed time. All tasks except
+ * the first TAG11 one are submitted while the graph is built; the timer
+ * starts right before that first task is submitted and stops once the
+ * semaphore posted by the last TAG11 task's callback has been acquired.
+ * The time in ms goes to stdout, the labels and the GFlops figure to stderr.
+ */
+static void _dw_cholesky(starpu_data_handle dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+	struct starpu_task *entry_task = NULL;
+	unsigned i, j, k;
+
+	/* posted by terminal_callback when the last step is done */
+	sem_t sem;
+	sem_init(&sem, 0, 0U);
+
+	/* create all the DAG nodes */
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k, nblocks, &sem);
+
+		/* the launch of the very first task is deferred */
+		if (k != 0)
+			starpu_submit_task(task);
+		else
+			entry_task = task;
+
+		for (j = k+1; j < nblocks; j++)
+		{
+			create_task_21(dataA, k, j);
+
+			/* only the (i,j) pairs with k < i <= j are updated */
+			for (i = k+1; i <= j; i++)
+				create_task_22(dataA, k, i, j);
+		}
+	}
+
+	/* release the entry task: nothing can run before it */
+	gettimeofday(&start, NULL);
+	starpu_submit_task(entry_task);
+
+	/* stall the application until the end of computations */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+	gettimeofday(&end, NULL);
+
+	/* elapsed time in microseconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	unsigned n = starpu_get_blas_nx(dataA);
+
+	/* n^3/3 operations are accounted for the whole factorization */
+	double flop = (1.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+/*
+ * Start StarPU and the timing facility, then allocate a dim x dim matrix of
+ * floats into *A.
+ *
+ * A:      receives the address of the allocated matrix
+ * dim:    number of rows and columns
+ * pinned: when non-zero the buffer is requested through
+ *         starpu_malloc_pinned_if_possible(), otherwise through malloc()
+ */
+void initialize_system(float **A, unsigned dim, unsigned pinned)
+{
+	/* widen before multiplying so that dim*dim is not computed (and possibly
+	 * wrapped) in unsigned arithmetic */
+	size_t nbytes = (size_t)dim * dim * sizeof(float);
+
+	starpu_init();
+
+	timing_init();
+
+	if (pinned)
+	{
+		starpu_malloc_pinned_if_possible(A, nbytes);
+	} 
+	else {
+		*A = malloc(nbytes);
+		/* fail here rather than on the first write to the matrix */
+		STARPU_ASSERT(*A);
+	}
+}
+
+/*
+ * Register matA (size x size floats, leading dimension ld) with StarPU,
+ * partition it with two filters that each receive nblocks as argument, run
+ * the factorization, then unpartition the data and shut StarPU down.
+ */
+void dw_cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+	starpu_filter first_filter;
+	starpu_filter second_filter;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_monitor_blas_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	first_filter.filter_func = starpu_vertical_block_filter_func;
+	first_filter.filter_arg = nblocks;
+
+	second_filter.filter_func = starpu_block_filter_func;
+	second_filter.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &first_filter, &second_filter);
+
+	_dw_cholesky(dataA, nblocks);
+
+	starpu_unpartition_data(dataA, 0);
+
+	starpu_shutdown();
+}
+
+/*
+ * Entry point of the strided cholesky example: parse the command line, build
+ * a test matrix, factorize it with dw_cholesky() and, when CHECK_OUTPUT is
+ * defined, print the input, the result and the product computed by SSYRK.
+ */
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive-definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1), with `size` added to each
+	 *	diagonal entry
+	 * */
+
+	parse_args(argc, argv);
+
+	/* initialize_system() performs the allocation (pinned or not): do not
+	 * allocate here as well, that first buffer would simply be leaked */
+	float *mat = NULL;
+
+	initialize_system(&mat, size, pinned);
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	printf("Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				printf("%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				printf(".\t");
+			}
+		}
+		printf("\n");
+	}
+#endif
+
+
+	dw_cholesky(mat, size, size, nblocks);
+
+#ifdef CHECK_OUTPUT
+	printf("Results :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				printf("%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				printf(".\t");
+				/* clear the other triangle before the product below */
+				mat[j+i*size] = 0.0f;
+			}
+		}
+		printf("\n");
+	}
+
+	fprintf(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f, 
+				mat, size, 0.0f, test_mat, size);
+
+	/* the product is only printed, no element-wise comparison is done */
+	fprintf(stderr, "comparing results ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				printf("%2.2f\t", test_mat[j +i*size]);
+			}
+			else {
+				printf(".\t");
+			}
+		}
+		printf("\n");
+	}
+
+	free(test_mat);
+#endif
+
+	return 0;
+}

+ 101 - 0
examples/cholesky/dw_cholesky.h

@@ -0,0 +1,101 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_H__
+#define __DW_CHOLESKY_H__
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+#ifdef USE_CUDA
+#include <cuda.h>
+#include <cublas.h>
+#endif
+
+#include "../common/blas.h"
+#include <starpu.h>
+
+/* Largest number of blocks per dimension supported by statically sized arrays. */
+#define NMAXBLOCKS	32
+
+/*
+ * Task tags: a task kind in the bits starting at bit 60 (1, 3 or 4), the
+ * step k either in the low bits (TAG11) or shifted by 32 (TAG21, TAG22),
+ * and the block indices below it. In TAG22, i is shifted by 16 and j is not
+ * shifted, so an index of 65536 or more would overlap the neighbouring field.
+ */
+#define TAG11(k)	( (1ULL<<60) | (unsigned long long)(k))
+#define TAG21(k,j)	(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j)))
+#define TAG22(k,i,j)	(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j)))
+
+/* Number of rows/columns of one block, from the file-scope size and nblocks. */
+#define BLOCKSIZE	(size/nblocks)
+
+
+/*
+ * 2*n1*n2*n3 computed in 64 bits. Each argument is parenthesized before the
+ * cast so that an expression such as `a+b` is converted as a whole.
+ */
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)(n1))*((uint64_t)(n2))*((uint64_t)(n3)))
+
+/*
+ * Bundle of per-task arguments: a data handle, three block indices, the
+ * number of blocks, a counter pointer and a semaphore pointer.
+ * NOTE(review): none of the cholesky sources visible alongside this header
+ * reference this type - confirm whether it is still needed.
+ */
+typedef struct {
+	starpu_data_handle dataA;
+	unsigned i;
+	unsigned j;
+	unsigned k;
+	unsigned nblocks;
+	unsigned *remaining;
+	sem_t *sem;
+} cl_args;
+
+/*
+ * Default problem parameters, overwritten by parse_args() (-size, -nblocks,
+ * -pin). Being `static` in a header, every file that includes it gets its
+ * own copy of these variables.
+ */
+static unsigned size = 4*1024;
+static unsigned nblocks = 4;
+static unsigned pinned = 0;
+
+void chol_core_codelet_update_u11(starpu_data_interface_t *, void *);
+void chol_core_codelet_update_u21(starpu_data_interface_t *, void *);
+void chol_core_codelet_update_u22(starpu_data_interface_t *, void *);
+
+#ifdef USE_CUDA
+void chol_cublas_codelet_update_u11(starpu_data_interface_t *descr, void *_args);
+void chol_cublas_codelet_update_u21(starpu_data_interface_t *descr, void *_args);
+void chol_cublas_codelet_update_u22(starpu_data_interface_t *descr, void *_args);
+#endif
+
+void initialize_system(float **A, unsigned dim, unsigned pinned);
+void dw_cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks);
+
+extern struct starpu_perfmodel_t chol_model_11;
+extern struct starpu_perfmodel_t chol_model_21;
+extern struct starpu_perfmodel_t chol_model_22;
+
+/*
+ * Parse the command line and update the file-scope size, nblocks and pinned
+ * settings.
+ *
+ *	-size N		sets size to N (base 10)
+ *	-nblocks N	sets nblocks to N (base 10)
+ *	-pin		sets pinned to 1
+ *	-h		prints the usage line
+ *
+ * An option that requires a value but is the last argument is reported on
+ * stderr and ignored, instead of reading past the end of argv.
+ */
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			if (i + 1 < argc)
+				size = strtol(argv[++i], NULL, 10);
+			else
+				fprintf(stderr, "%s: option -size requires a value\n", argv[0]);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			if (i + 1 < argc)
+				nblocks = strtol(argv[++i], NULL, 10);
+			else
+				fprintf(stderr, "%s: option -nblocks requires a value\n", argv[0]);
+		}
+
+		if (strcmp(argv[i], "-pin") == 0) {
+			pinned = 1;
+		}
+
+		if (strcmp(argv[i], "-h") == 0) {
+			printf("usage : %s [-pin] [-size size] [-nblocks nblocks]\n", argv[0]);
+		}
+	}
+}
+
+#endif // __DW_CHOLESKY_H__

+ 195 - 0
examples/cholesky/dw_cholesky_kernels.c

@@ -0,0 +1,195 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_cholesky.h"
+#include "../common/blas.h"
+
+/*
+ *   U22 
+ */
+
+/*
+ * Shared body of the U22 kernels: buffers[2] <- buffers[2] - buffers[0] *
+ * transpose(buffers[1]), as one single-precision GEMM with alpha = -1 and
+ * beta = 1. s selects the backend: 0 calls SGEMM, 1 calls cublasSgemm (only
+ * when USE_CUDA is defined); any other value trips STARPU_ASSERT.
+ */
+static inline void chol_common_core_codelet_update_u22(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *_args)
+{
+	float *a_left	= (float *)buffers[0].blas.ptr;
+	float *a_right	= (float *)buffers[1].blas.ptr;
+	float *a_center	= (float *)buffers[2].blas.ptr;
+
+	/* GEMM dimensions: note that the nx field gives M and the ny field N */
+	unsigned gemm_n = buffers[2].blas.ny;
+	unsigned gemm_m = buffers[2].blas.nx;
+	unsigned gemm_k = buffers[0].blas.ny;
+
+	unsigned ld_left = buffers[0].blas.ld;
+	unsigned ld_right = buffers[1].blas.ld;
+	unsigned ld_center = buffers[2].blas.ld;
+
+	if (s == 0) {
+		SGEMM("N", "T", gemm_m, gemm_n, gemm_k, -1.0f, a_left, ld_left,
+			a_right, ld_right, 1.0f, a_center, ld_center);
+	}
+#ifdef USE_CUDA
+	else if (s == 1) {
+		cublasSgemm('n', 't', gemm_m, gemm_n, gemm_k,
+				-1.0f, a_left, ld_left, a_right, ld_right,
+				 1.0f, a_center, ld_center);
+	}
+#endif
+	else {
+		STARPU_ASSERT(0);
+	}
+}
+
+/* U22 entry points: the same kernel body with backend 0 (SGEMM) or, when
+ * USE_CUDA is defined, backend 1 (cublasSgemm). */
+void chol_core_codelet_update_u22(starpu_data_interface_t *descr, void *_args)
+{
+	chol_common_core_codelet_update_u22(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void chol_cublas_codelet_update_u22(starpu_data_interface_t *descr, void *_args)
+{
+	chol_common_core_codelet_update_u22(descr, 1, _args);
+}
+#endif// USE_CUDA
+
+/* 
+ * U21
+ */
+
+/*
+ * Shared body of the U21 kernels: one single-precision triangular solve
+ * (side R, uplo L, trans T, diag N, alpha = 1) that uses buffers[0] as the
+ * triangular matrix and overwrites buffers[1]. s selects the backend: 0
+ * calls STRSM, 1 calls cublasStrsm (only when USE_CUDA is defined); any
+ * other value trips STARPU_ASSERT.
+ */
+static inline void chol_common_codelet_update_u21(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *_args)
+{
+	float *tri = (float *)buffers[0].blas.ptr;
+	float *panel = (float *)buffers[1].blas.ptr;
+
+	unsigned ld_tri = buffers[0].blas.ld;
+	unsigned ld_panel = buffers[1].blas.ld;
+
+	/* the ny field is passed as M and the nx field as N */
+	unsigned trsm_m = buffers[1].blas.ny;
+	unsigned trsm_n = buffers[1].blas.nx;
+
+	if (s == 0) {
+		STRSM("R", "L", "T", "N", trsm_m, trsm_n, 1.0f, tri, ld_tri, panel, ld_panel);
+	}
+#ifdef USE_CUDA
+	else if (s == 1) {
+		cublasStrsm('R', 'L', 'T', 'N', trsm_m, trsm_n, 1.0f, tri, ld_tri, panel, ld_panel);
+	}
+#endif
+	else {
+		STARPU_ASSERT(0);
+	}
+}
+
+/* U21 entry points: the same kernel body with backend 0 (STRSM) or, when
+ * USE_CUDA is defined, backend 1 (cublasStrsm). */
+void chol_core_codelet_update_u21(starpu_data_interface_t *descr, void *_args)
+{
+	 chol_common_codelet_update_u21(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void chol_cublas_codelet_update_u21(starpu_data_interface_t *descr, void *_args)
+{
+	chol_common_codelet_update_u21(descr, 1, _args);
+}
+#endif 
+
+/*
+ *	U11
+ */
+
+/*
+ * Shared body of the U11 kernels: unblocked in-place factorization of the
+ * block described by descr[0]. For every column z:
+ *	- the diagonal entry is replaced by its square root (asserted non-zero),
+ *	- the entries below it are divided by that value,
+ *	- the trailing sub-matrix receives a rank-1 update with that column.
+ * s selects the backend: 0 uses SSCAL/SSYR, 1 uses the CUBLAS equivalents
+ * (only when USE_CUDA is defined); any other value trips STARPU_ASSERT.
+ */
+static inline void chol_common_codelet_update_u11(starpu_data_interface_t *descr, int s, __attribute__((unused)) void *_args) 
+{
+	float *sub11 = (float *)descr[0].blas.ptr;
+
+	unsigned nx = descr[0].blas.ny;
+	unsigned ld = descr[0].blas.ld;
+
+	unsigned z;
+
+	if (s == 0) {
+		for (z = 0; z < nx; z++)
+		{
+			float *diag = &sub11[z+z*ld];
+			float lambda11 = sqrt(*diag);
+
+			*diag = lambda11;
+
+			STARPU_ASSERT(lambda11 != 0.0f);
+
+			/* scale the part of column z that lies below the diagonal */
+			SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+			/* rank-1 update of the trailing sub-matrix */
+			SSYR("L", nx - z - 1, -1.0f,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[(z+1)+(z+1)*ld], ld);
+		}
+	}
+#ifdef USE_CUDA
+	else if (s == 1) {
+		for (z = 0; z < nx; z++)
+		{
+			float lambda11;
+
+			/* fetch the diagonal entry through cublasGetVector, take
+			 * its square root, and write it back through
+			 * cublasSetVector */
+			cublasGetVector(1, sizeof(float), &sub11[z+z*ld], sizeof(float), &lambda11, sizeof(float));
+
+			lambda11 = sqrt(lambda11);
+
+			cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+
+			STARPU_ASSERT(lambda11 != 0.0f);
+
+			cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+
+			/* NOTE(review): the SSYR call of backend 0 passes "L"
+			 * while this one passes 'U' - confirm this is intended */
+			cublasSsyr('U', nx - z - 1, -1.0f,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[(z+1)+(z+1)*ld], ld);
+		}
+	}
+#endif
+	else {
+		STARPU_ASSERT(0);
+	}
+
+}
+
+
+/* U11 entry points: the same kernel body with backend 0 (SSCAL/SSYR) or,
+ * when USE_CUDA is defined, backend 1 (CUBLAS). */
+void chol_core_codelet_update_u11(starpu_data_interface_t *descr, void *_args)
+{
+	chol_common_codelet_update_u11(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void chol_cublas_codelet_update_u11(starpu_data_interface_t *descr, void *_args)
+{
+	chol_common_codelet_update_u11(descr, 1, _args);
+}
+#endif// USE_CUDA

+ 152 - 0
examples/cholesky/dw_cholesky_models.c

@@ -0,0 +1,152 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_cholesky_models.h"
+
+/*
+ * As a convention, in that file, descr[0] is represented by A,
+ * 				  descr[1] is B ...
+ */
+
+/*
+ *	Number of flops of Gemm 
+ */
+
+//#define USE_PERTURBATION	1
+
+
+/*
+ * PERTURBATE(a) is the identity unless USE_PERTURBATION is defined, in which
+ * case a is multiplied by a drand48()-based factor in [1-AMPL, 1+AMPL).
+ * NOTE(review): AMPL is not defined anywhere in this file, so enabling
+ * USE_PERTURBATION needs a definition from elsewhere - TODO confirm.
+ */
+#ifdef USE_PERTURBATION
+#define PERTURBATE(a)	((drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
+#else
+#define PERTURBATE(a)	(a)
+#endif
+
+/*
+ * Cost estimate of the "11" task on a core: n^3 scaled by hard-coded
+ * constants, where n is starpu_get_blas_nx() of the first buffer. The value
+ * goes through PERTURBATE() before being returned.
+ */
+static double core_chol_task_11_cost(starpu_buffer_descr *descr)
+{
+	const uint32_t n = starpu_get_blas_nx(descr[0].state);
+	const double cube = ((double)(n)*n*n);
+	double cost = (cube/1000.0f*0.894/0.79176);
+
+#ifdef MODEL_DEBUG
+	printf("core_chol_task_11_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of the "11" task on a CUDA device: n^3 scaled by hard-coded
+ * constants, where n is starpu_get_blas_nx() of the first buffer. The value
+ * goes through PERTURBATE() before being returned.
+ */
+static double cuda_chol_task_11_cost(starpu_buffer_descr *descr)
+{
+	const uint32_t n = starpu_get_blas_nx(descr[0].state);
+	const double cube = ((double)(n)*n*n);
+	double cost = (cube/50.0f/10.75/5.088633/0.9883);
+
+#ifdef MODEL_DEBUG
+	printf("cuda_chol_task_11_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of the "21" task on a core: n^3 scaled by hard-coded
+ * constants, where n is starpu_get_blas_nx() of the first buffer. The value
+ * goes through PERTURBATE() before being returned.
+ */
+static double core_chol_task_21_cost(starpu_buffer_descr *descr)
+{
+	const uint32_t n = starpu_get_blas_nx(descr[0].state);
+	const double cube = ((double)(n)*n*n);
+	double cost = (cube/7706.674/0.95/0.9965);
+
+#ifdef MODEL_DEBUG
+	printf("core_chol_task_21_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of the "21" task on a CUDA device: n^3 scaled by hard-coded
+ * constants, where n is starpu_get_blas_nx() of the first buffer. The value
+ * goes through PERTURBATE() before being returned.
+ */
+static double cuda_chol_task_21_cost(starpu_buffer_descr *descr)
+{
+	const uint32_t n = starpu_get_blas_nx(descr[0].state);
+	const double cube = ((double)(n)*n*n);
+	double cost = (cube/50.0f/10.75/87.29520);
+
+#ifdef MODEL_DEBUG
+	printf("cuda_chol_task_21_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of the "22" task on a core: n^3 scaled by hard-coded
+ * constants, where n is starpu_get_blas_nx() of the first buffer. The value
+ * goes through PERTURBATE() before being returned.
+ */
+static double core_chol_task_22_cost(starpu_buffer_descr *descr)
+{
+	const uint32_t n = starpu_get_blas_nx(descr[0].state);
+	const double cube = ((double)(n)*n*n);
+	double cost = (cube/50.0f/10.75/8.0760);
+
+#ifdef MODEL_DEBUG
+	printf("core_chol_task_22_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of the "22" task on a CUDA device: n^3 scaled by hard-coded
+ * constants, where n is starpu_get_blas_nx() of the first buffer. The value
+ * goes through PERTURBATE() before being returned.
+ */
+static double cuda_chol_task_22_cost(starpu_buffer_descr *descr)
+{
+	const uint32_t n = starpu_get_blas_nx(descr[0].state);
+	const double cube = ((double)(n)*n*n);
+	double cost = (cube/50.0f/10.75/76.30666);
+
+#ifdef MODEL_DEBUG
+	printf("cuda_chol_task_22_cost n %d cost %e\n", n, cost);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Performance model of the "11" task: one cost function per architecture
+ * (core and CUDA), declared HISTORY_BASED under the symbol "chol_model_11".
+ */
+struct starpu_perfmodel_t chol_model_11 = {
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = core_chol_task_11_cost },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_11_cost }
+	},
+	.type = HISTORY_BASED,
+	.symbol = "chol_model_11"
+};
+
+/*
+ * Performance model of the "21" task: one cost function per architecture
+ * (core and CUDA), declared HISTORY_BASED under the symbol "chol_model_21".
+ */
+struct starpu_perfmodel_t chol_model_21 = {
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = core_chol_task_21_cost },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_21_cost }
+	},
+	.type = HISTORY_BASED,
+	.symbol = "chol_model_21"
+};
+
+/*
+ * Performance model of the "22" task: one cost function per architecture
+ * (core and CUDA), declared HISTORY_BASED under the symbol "chol_model_22".
+ */
+struct starpu_perfmodel_t chol_model_22 = {
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = core_chol_task_22_cost },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_chol_task_22_cost }
+	},
+	.type = HISTORY_BASED,
+	.symbol = "chol_model_22"
+};

+ 22 - 0
examples/cholesky/dw_cholesky_models.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_CHOLESKY_MODELS_H__
+#define __DW_CHOLESKY_MODELS_H__
+
+#include <starpu.h>
+
+#endif // __DW_CHOLESKY_MODELS_H__

+ 306 - 0
examples/cholesky/dw_cholesky_no_stride.c

@@ -0,0 +1,306 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_cholesky.h"
+#include "dw_cholesky_models.h"
+
+/*
+ * Blocks are indexed A[y][x]; main() only allocates and registers the
+ * entries with x <= y. A_state[y][x] is the handle registered for A[y][x].
+ */
+float *A[NMAXBLOCKS][NMAXBLOCKS];
+starpu_data_handle A_state[NMAXBLOCKS][NMAXBLOCKS];
+
+/*
+ *	Some useful functions
+ */
+
+/*
+ * Allocate a task with starpu_task_create(), attach the tag `id` to it and
+ * leave its codelet argument empty. The caller still has to set the codelet
+ * and the buffers.
+ */
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *new_task = starpu_task_create();
+
+	new_task->tag_id = id;
+	new_task->use_tag = 1;
+	new_task->cl_arg = NULL;
+
+	return new_task;
+}
+
+/* Task callback: argcb is the semaphore (sem_t *) that gets posted. */
+static void terminal_callback(void *argcb)
+{
+	sem_post((sem_t *)argcb);
+}
+
+/*
+ *	Create the codelets
+ */
+
+/*
+ * Codelet attached to the tasks built by create_task_11(): `.where` is ANY,
+ * it accesses one buffer and uses chol_model_11 as performance model. The
+ * CUBLAS and Gordon (SPU_FUNC_POTRF) entry points are only registered when
+ * USE_CUDA, respectively USE_GORDON, is defined.
+ */
+static starpu_codelet cl11 =
+{
+	.where = ANY,
+	.core_func = chol_core_codelet_update_u11,
+#ifdef USE_CUDA
+	.cublas_func = chol_cublas_codelet_update_u11,
+#endif
+#ifdef USE_GORDON
+	.gordon_func = SPU_FUNC_POTRF,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+/*
+ * Build (but do not submit) the task tagged TAG11(k). It runs codelet cl11 in
+ * RW mode on A_state[k][k] at MAX_PRIO. For k > 0 the tag depends on
+ * TAG22(k-1, k, k). When k is the last block index, the task gets
+ * terminal_callback as callback, with `sem` as its argument.
+ */
+static struct starpu_task * create_task_11(unsigned k, unsigned nblocks, sem_t *sem)
+{
+	const int is_last_step = (k == nblocks - 1);
+	struct starpu_task *t11 = create_task(TAG11(k));
+
+	t11->cl = &cl11;
+
+	/* the single buffer is the (k,k) block, updated in place */
+	t11->buffers[0].state = A_state[k][k];
+	t11->buffers[0].mode = RW;
+
+	t11->priority = MAX_PRIO;
+
+	/* every step but the first waits for the previous step's (k,k) update */
+	if (k != 0)
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+
+	/* the end of the last step is reported through the semaphore */
+	if (is_last_step) {
+		t11->callback_func = terminal_callback;
+		t11->callback_arg = sem;
+	}
+
+	return t11;
+}
+
+/*
+ * Codelet attached to the tasks built by create_task_21(): `.where` is ANY,
+ * it accesses two buffers and uses chol_model_21 as performance model. The
+ * CUBLAS and Gordon (SPU_FUNC_STRSM) entry points are only registered when
+ * USE_CUDA, respectively USE_GORDON, is defined.
+ */
+static starpu_codelet cl21 =
+{
+	.where = ANY,
+	.core_func = chol_core_codelet_update_u21,
+#ifdef USE_CUDA
+	.cublas_func = chol_cublas_codelet_update_u21,
+#endif
+#ifdef USE_GORDON
+	.gordon_func = SPU_FUNC_STRSM,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+/*
+ * Build and submit the task tagged TAG21(k, j). It runs codelet cl21, reading
+ * A_state[k][k] and updating A_state[j][k]. The tag always depends on
+ * TAG11(k) and, for k > 0, also on TAG22(k-1, k, j). Only the j == k+1 task
+ * gets MAX_PRIO.
+ */
+static void create_task_21(unsigned k, unsigned j)
+{
+	struct starpu_task *t21 = create_task(TAG21(k, j));
+
+	t21->cl = &cl21;
+
+	/* buffer 0: read-only [k][k] block, buffer 1: [j][k] block in RW */
+	t21->buffers[0].state = A_state[k][k];
+	t21->buffers[0].mode = R;
+	t21->buffers[1].state = A_state[j][k];
+	t21->buffers[1].mode = RW;
+
+	if (j == k+1)
+		t21->priority = MAX_PRIO;
+
+	/* the first step has no previous TAG22 to wait for */
+	if (k == 0)
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	else
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+
+	starpu_submit_task(t21);
+}
+
+/*
+ * Codelet attached to the tasks built by create_task_22(): `.where` is ANY,
+ * it accesses three buffers and uses chol_model_22 as performance model. The
+ * CUBLAS and Gordon (SPU_FUNC_SGEMM) entry points are only registered when
+ * USE_CUDA, respectively USE_GORDON, is defined.
+ */
+static starpu_codelet cl22 =
+{
+	.where = ANY,
+	.core_func = chol_core_codelet_update_u22,
+#ifdef USE_CUDA
+	.cublas_func = chol_cublas_codelet_update_u22,
+#endif
+#ifdef USE_GORDON
+	.gordon_func = SPU_FUNC_SGEMM,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+/*
+ * Build and submit the task tagged TAG22(k, i, j). It runs codelet cl22,
+ * reading A_state[i][k] and A_state[j][k] and updating A_state[j][i]. The
+ * tag depends on TAG21(k, i) and TAG21(k, j) and, for k > 0, also on
+ * TAG22(k-1, i, j). Only the i == j == k+1 task gets MAX_PRIO.
+ */
+static void create_task_22(unsigned k, unsigned i, unsigned j)
+{
+	struct starpu_task *t22 = create_task(TAG22(k, i, j));
+
+	t22->cl = &cl22;
+
+	/* two read-only inputs followed by the block that is updated */
+	t22->buffers[0].state = A_state[i][k];
+	t22->buffers[0].mode = R;
+	t22->buffers[1].state = A_state[j][k];
+	t22->buffers[1].mode = R;
+	t22->buffers[2].state = A_state[j][i];
+	t22->buffers[2].mode = RW;
+
+	if (i == k + 1 && j == k + 1)
+		t22->priority = MAX_PRIO;
+
+	/* the first step has no previous update of the same block to wait for */
+	if (k == 0)
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+	else
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+
+	starpu_submit_task(t22);
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+/*
+ * Declare every task of the factorization over the file-scope A_state
+ * blocks, run them and report the elapsed time. All tasks except the first
+ * TAG11 one are submitted while the graph is built; the timer starts right
+ * before that first task is submitted and stops once the semaphore posted by
+ * the last TAG11 task's callback has been acquired. The time in ms goes to
+ * stdout, the labels and the GFlops figure to stderr.
+ */
+static void dw_cholesky_no_stride(void)
+{
+	struct timeval start;
+	struct timeval end;
+	struct starpu_task *entry_task = NULL;
+	unsigned i, j, k;
+
+	/* posted by terminal_callback when the last step is done */
+	sem_t sem;
+	sem_init(&sem, 0, 0U);
+
+	/* create all the DAG nodes */
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(k, nblocks, &sem);
+
+		/* the launch of the very first task is deferred */
+		if (k != 0)
+			starpu_submit_task(task);
+		else
+			entry_task = task;
+
+		for (j = k+1; j < nblocks; j++)
+		{
+			create_task_21(k, j);
+
+			/* only the (i,j) pairs with k < i <= j are updated */
+			for (i = k+1; i <= j; i++)
+				create_task_22(k, i, j);
+		}
+	}
+
+	/* release the entry task: nothing can run before it */
+	gettimeofday(&start, NULL);
+	starpu_submit_task(entry_task);
+
+	/* stall the application until the end of computations */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+	gettimeofday(&end, NULL);
+
+	/* elapsed time in microseconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	/* size^3/3 operations are accounted for the whole factorization */
+	double flop = (1.0f*size*size*size)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+/*
+ * Entry point of the non-strided cholesky example: parse the command line,
+ * allocate one aligned buffer per block with x <= y, fill the blocks,
+ * register each of them with StarPU and run the factorization.
+ *
+ * Returns 0 on success and 1 when a block cannot be allocated.
+ */
+int main(int argc, char **argv)
+{
+	unsigned x, y;
+	unsigned i, j;
+
+	parse_args(argc, argv);
+	assert(nblocks <= NMAXBLOCKS);
+
+	/* both operands are unsigned, hence %u */
+	fprintf(stderr, "BLOCK SIZE = %u\n", size / nblocks);
+
+	starpu_init();
+	timing_init();
+
+	/* Each block is allocated exactly once, aligned on 128 bytes. A plain
+	 * malloc() before this call would be leaked as soon as
+	 * posix_memalign() overwrites the pointer. */
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y) {
+			int ret = posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
+			if (ret != 0) {
+				/* posix_memalign() reports errors through its
+				 * return value, not through errno */
+				fprintf(stderr, "posix_memalign failed: %s\n", strerror(ret));
+				return 1;
+			}
+			assert(A[y][x]);
+		}
+	}
+
+	/* create a simple symmetric positive-definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + 2*size on the diagonal
+	 *	to make it stable )
+	 * */
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	if (x <= y) {
+		for (i = 0; i < BLOCKSIZE; i++)
+		for (j = 0; j < BLOCKSIZE; j++)
+		{
+			A[y][x][i*BLOCKSIZE + j] =
+				(float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
+
+			/* make it a little more numerically stable ... ;) */
+			if ((x == y) && (i == j))
+				A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
+		}
+	}
+
+
+
+	/* every block becomes its own piece of data, with a leading dimension
+	 * equal to the block size */
+	for (y = 0; y < nblocks; y++)
+	for (x = 0; x < nblocks; x++)
+	{
+		if (x <= y) {
+			starpu_monitor_blas_data(&A_state[y][x], 0, (uintptr_t)A[y][x], 
+				BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
+		}
+	}
+
+	dw_cholesky_no_stride();
+
+	starpu_shutdown();
+	return 0;
+}
+
+

+ 30 - 0
examples/common/Makefile.in

@@ -0,0 +1,30 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Directory searched by pkg-config for libstarpu.pc; the @STARPUDIR@
+# placeholder is presumably substituted when this Makefile.in is processed.
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+# "$$" yields a literal "$", so pkg-config is run by the shell executing each
+# recipe rather than by make itself.
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+# "all" and "clean" never create files of those names.
+.PHONY: all clean
+
+all: blas.o blas_model.o
+
+blas.o: blas.c blas.h
+	$(CC) $(CFLAGS) blas.c -c -o blas.o
+
+blas_model.o: blas_model.c blas_model.h
+	$(CC) $(CFLAGS) blas_model.c -c -o blas_model.o
+
+clean:
+	@rm -f *.o *.d *.gcno *.gcda

+ 241 - 0
examples/common/blas.c

@@ -0,0 +1,241 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include <starpu.h>
+#include "blas.h"
+
+/*
+    This file contains BLAS wrappers for the different BLAS implementations
+  (e.g. REFBLAS, ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
+  libraries do not supply C-based ordering.
+ */
+
+#ifdef ATLAS
+
+/*
+ * Single-precision general matrix multiply through CBLAS, column-major.
+ * Only the first character of transa/transb is inspected: 'N' or 'n' selects
+ * CblasNoTrans, any other letter selects CblasTrans.
+ */
+inline void SGEMM(char *transa, char *transb, int M, int N, int K,
+			float alpha, float *A, int lda, float *B, int ldb,
+			float beta, float *C, int ldc)
+{
+	enum CBLAS_TRANSPOSE op_a = CblasTrans;
+	enum CBLAS_TRANSPOSE op_b = CblasTrans;
+
+	if (toupper(transa[0]) == 'N')
+		op_a = CblasNoTrans;
+	if (toupper(transb[0]) == 'N')
+		op_b = CblasNoTrans;
+
+	cblas_sgemm(CblasColMajor, op_a, op_b,
+			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+/* CBLAS binding: forwards N, X and incX unchanged to cblas_sasum and returns its result. */
+inline float SASUM(int N, float *X, int incX)
+{
+	return cblas_sasum(N, X, incX);
+}
+
+/* CBLAS binding: forwards the arguments unchanged to cblas_sscal. */
+void SSCAL(int N, float alpha, float *X, int incX)
+{
+	cblas_sscal(N, alpha, X, incX);
+}
+
+/*
+ * Triangular solve with multiple right-hand sides through CBLAS, column-major.
+ * The first character of each option string is mapped to a CBLAS enum:
+ * side 'L' -> left (else right), uplo 'U' -> upper (else lower),
+ * transa 'N' -> no transpose (else transpose), diag 'N' -> non-unit (else unit).
+ */
+void STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const float alpha, const float *A, const int lda,
+                   float *B, const int ldb)
+{
+	const enum CBLAS_SIDE cb_side =
+		(toupper(side[0]) == 'L') ? CblasLeft : CblasRight;
+	const enum CBLAS_UPLO cb_uplo =
+		(toupper(uplo[0]) == 'U') ? CblasUpper : CblasLower;
+	const enum CBLAS_TRANSPOSE cb_trans =
+		(toupper(transa[0]) == 'N') ? CblasNoTrans : CblasTrans;
+	const enum CBLAS_DIAG cb_diag =
+		(toupper(diag[0]) == 'N') ? CblasNonUnit : CblasUnit;
+
+	cblas_strsm(CblasColMajor, cb_side, cb_uplo, cb_trans, cb_diag,
+			m, n, alpha, A, lda, B, ldb);
+}
+
+/*
+ * Symmetric rank-1 update through CBLAS, column-major.
+ * uplo: first character 'U'/'u' selects CblasUpper, anything else CblasLower.
+ */
+void SSYR (const char *uplo, const int n, const float alpha,
+                  const float *x, const int incx, float *A, const int lda)
+{
+	const enum CBLAS_UPLO cb_uplo =
+		(toupper(uplo[0]) == 'U') ? CblasUpper : CblasLower;
+
+	cblas_ssyr(CblasColMajor, cb_uplo, n, alpha, x, incx, A, lda);
+}
+
+/*
+ * Symmetric rank-k update through CBLAS, column-major.
+ * uplo 'U' -> upper (else lower); trans 'N' -> no transpose (else transpose).
+ * Only the first character of each option string is examined.
+ */
+void SSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const float alpha, const float *A,
+                   const int lda, const float beta, float *C,
+                   const int ldc)
+{
+	const enum CBLAS_UPLO cb_uplo =
+		(toupper(uplo[0]) == 'U') ? CblasUpper : CblasLower;
+	const enum CBLAS_TRANSPOSE cb_trans =
+		(toupper(trans[0]) == 'N') ? CblasNoTrans : CblasTrans;
+
+	cblas_ssyrk(CblasColMajor, cb_uplo, cb_trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+
+/*
+ * CBLAS binding for the rank-1 update: forwards the arguments to cblas_sger.
+ * NOTE(review): this is the only wrapper in the ATLAS branch that passes
+ * CblasRowMajor; every other one passes CblasColMajor, and the Fortran branch
+ * forwards to sger_ as is. Confirm against the callers whether the row-major
+ * layout here is intentional before changing it.
+ */
+void SGER (const int m, const int n, const float alpha,
+                  const float *x, const int incx, const float *y,
+                  const int incy, float *A, const int lda)
+{
+	cblas_sger(CblasRowMajor, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+/*
+ * Triangular solve with a single vector through CBLAS, column-major.
+ * uplo 'U' -> upper (else lower), trans 'N' -> no transpose (else transpose),
+ * diag 'N' -> non-unit (else unit); only the first character is examined.
+ */
+void STRSV (const char *uplo, const char *trans, const char *diag,
+                   const int n, const float *A, const int lda, float *x,
+                   const int incx)
+{
+	const enum CBLAS_UPLO cb_uplo =
+		(toupper(uplo[0]) == 'U') ? CblasUpper : CblasLower;
+	const enum CBLAS_TRANSPOSE cb_trans =
+		(toupper(trans[0]) == 'N') ? CblasNoTrans : CblasTrans;
+	const enum CBLAS_DIAG cb_diag =
+		(toupper(diag[0]) == 'N') ? CblasNonUnit : CblasUnit;
+
+	cblas_strsv(CblasColMajor, cb_uplo, cb_trans, cb_diag, n, A, lda, x, incx);
+}
+
+/*
+ * Triangular matrix-matrix product through CBLAS, column-major.
+ * The first character of each option string is mapped to a CBLAS enum:
+ * side 'L' -> left (else right), uplo 'U' -> upper (else lower),
+ * transA 'N' -> no transpose (else transpose), diag 'N' -> non-unit (else unit).
+ */
+void STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb)
+{
+	const enum CBLAS_SIDE cb_side =
+		(toupper(side[0]) == 'L') ? CblasLeft : CblasRight;
+	const enum CBLAS_UPLO cb_uplo =
+		(toupper(uplo[0]) == 'U') ? CblasUpper : CblasLower;
+	const enum CBLAS_TRANSPOSE cb_trans =
+		(toupper(transA[0]) == 'N') ? CblasNoTrans : CblasTrans;
+	const enum CBLAS_DIAG cb_diag =
+		(toupper(diag[0]) == 'N') ? CblasNonUnit : CblasUnit;
+
+	cblas_strmm(CblasColMajor, cb_side, cb_uplo, cb_trans, cb_diag,
+			m, n, alpha, A, lda, B, ldb);
+}
+
+/*
+ * Triangular matrix-vector product through CBLAS, column-major.
+ * uplo 'U' -> upper (else lower), transA 'N' -> no transpose (else transpose),
+ * diag 'N' -> non-unit (else unit); only the first character is examined.
+ */
+void STRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const float *A, const int lda, float *X,
+                 const int incX)
+{
+	const enum CBLAS_UPLO cb_uplo =
+		(toupper(uplo[0]) == 'U') ? CblasUpper : CblasLower;
+	const enum CBLAS_TRANSPOSE cb_trans =
+		(toupper(transA[0]) == 'N') ? CblasNoTrans : CblasTrans;
+	const enum CBLAS_DIAG cb_diag =
+		(toupper(diag[0]) == 'N') ? CblasNonUnit : CblasUnit;
+
+	cblas_strmv(CblasColMajor, cb_uplo, cb_trans, cb_diag, n, A, lda, X, incX);
+}
+
+/* CBLAS binding: forwards the arguments unchanged to cblas_saxpy. */
+void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+{
+	cblas_saxpy(n, alpha, X, incX, Y, incY);
+}
+
+/*
+ * CBLAS binding: returns the value of cblas_isamax(n, X, incX) converted to int.
+ * NOTE(review): CBLAS index routines are documented as zero-based whereas the
+ * Fortran isamax_ used by the other branch is one-based -- confirm that
+ * callers cope with the difference between the two builds.
+ */
+int ISAMAX (const int n, float *X, const int incX)
+{
+    return cblas_isamax(n, X, incX);
+}
+
+/* CBLAS binding: forwards the arguments unchanged to cblas_sdot and returns its result. */
+float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+{
+	return cblas_sdot(n, x, incx, y, incy);
+}
+
+
+#elif defined(GOTO) || defined(SYSTEM_BLAS)
+
+/*
+ * Fortran BLAS binding: forwards to sgemm_, passing the option strings and
+ * matrices as they are and every scalar argument by address.
+ */
+inline void SGEMM(char *transa, char *transb, int M, int N, int K, 
+			float alpha, float *A, int lda, float *B, int ldb, 
+			float beta, float *C, int ldc)
+{
+	sgemm_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+/*
+ * Fortran BLAS binding: forwards to sasum_ with scalars passed by address.
+ * blas.h declares sasum_ as returning double, so the value is converted to
+ * float by the return statement.
+ */
+inline float SASUM(int N, float *X, int incX)
+{
+	return sasum_(&N, X, &incX);
+}
+
+/* Fortran BLAS binding: forwards to sscal_ with scalars passed by address. */
+void SSCAL(int N, float alpha, float *X, int incX)
+{
+	sscal_(&N, &alpha, X, &incX);
+}
+
+/*
+ * Fortran BLAS binding: forwards to strsm_. The option strings are handed
+ * over untouched; scalars are passed by address.
+ */
+void STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const float alpha, const float *A, const int lda,
+                   float *B, const int ldb)
+{
+	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+/* Fortran BLAS binding: forwards to ssyr_ with scalars passed by address. */
+void SSYR (const char *uplo, const int n, const float alpha,
+                  const float *x, const int incx, float *A, const int lda)
+{
+	ssyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
+}
+
+/* Fortran BLAS binding: forwards to ssyrk_ with scalars passed by address. */
+void SSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const float alpha, const float *A,
+                   const int lda, const float beta, float *C,
+                   const int ldc)
+{
+	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
+}
+
+/*
+ * Fortran BLAS binding: forwards to sger_ with scalars passed by address.
+ * NOTE(review): the ATLAS branch of this file calls cblas_sger with
+ * CblasRowMajor; confirm both builds give callers the same result.
+ */
+void SGER (const int m, const int n, const float alpha,
+                  const float *x, const int incx, const float *y,
+                  const int incy, float *A, const int lda)
+{
+	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+
+/* Fortran BLAS binding: forwards to strsv_ with scalars passed by address. */
+void STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const float *A, const int lda, float *x, 
+                   const int incx)
+{
+	strsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
+}
+
+/* Fortran BLAS binding: forwards to strmm_ with scalars passed by address. */
+void STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb)
+{
+	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+/* Fortran BLAS binding: forwards to strmv_ with scalars passed by address. */
+void STRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const float *A, const int lda, float *X,
+                 const int incX)
+{
+	strmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
+}
+
+/* Fortran BLAS binding: forwards to saxpy_ with scalars passed by address. */
+void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incY)
+{
+	saxpy_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+/*
+ * Fortran BLAS binding: returns the value of isamax_ called with n and incX
+ * passed by address.
+ * NOTE(review): Fortran ISAMAX is documented as one-based whereas the CBLAS
+ * routine used by the ATLAS branch is zero-based -- confirm that callers
+ * cope with the difference between the two builds.
+ */
+int ISAMAX (const int n, float *X, const int incX)
+{
+    return isamax_ (&n, X, &incX);
+}
+
+/*
+ * Fortran BLAS binding for the dot product. sdot_ is declared in blas.h as
+ * returning double (GOTOBLAS returns a FLOATRET, which is a double), hence
+ * the explicit narrowing to float.
+ */
+float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
+{
+	return (float)sdot_(&n, x, &incx, y, &incy);
+}
+
+#else
+#error "no BLAS lib available..."
+#endif

+ 97 - 0
examples/common/blas.h

@@ -0,0 +1,97 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_H__
+#define __BLAS_H__
+
+#include <starpu.h>
+
+#ifdef ATLAS
+#include <cblas.h>
+#endif
+
+/*
+ * Single-precision BLAS wrappers implemented in blas.c. The same prototypes
+ * are used whichever backend macro (ATLAS, GOTO or SYSTEM_BLAS) is defined;
+ * option arguments (side, uplo, trans, diag) are strings.
+ */
+void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, float *A, int lda, 
+		float *B, int ldb, float beta, float *C, int ldc);
+float SASUM(int N, float *X, int incX);
+void SSCAL(int N, float alpha, float *X, int incX);
+void STRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const float alpha, const float *A, const int lda,
+                   float *B, const int ldb);
+void SSYR (const char *uplo, const int n, const float alpha,
+                  const float *x, const int incx, float *A, const int lda);
+void SSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const float alpha, const float *A,
+                   const int lda, const float beta, float *C,
+                   const int ldc);
+void SGER (const int m, const int n, const float alpha,
+                  const float *x, const int incx, const float *y,
+                  const int incy, float *A, const int lda);
+void STRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const float *A, const int lda, float *x, 
+                   const int incx);
+void STRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const float alpha, const float *A, const int lda,
+                 float *B, const int ldb);
+void STRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const float *A, const int lda, float *X,
+                 const int incX);
+void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
+int ISAMAX (const int n, float *X, const int incX);
+float SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
+
+#if defined(GOTO) || defined(SYSTEM_BLAS)
+
+/*
+ * Declarations of the Fortran BLAS symbols (trailing underscore) called by
+ * the GOTO / SYSTEM_BLAS branch of blas.c. Every scalar is passed by pointer.
+ */
+extern void sgemm_ (const char *transa, const char *transb, const int *m,
+                   const int *n, const int *k, const float *alpha, 
+                   const float *A, const int *lda, const float *B, 
+                   const int *ldb, const float *beta, float *C, 
+                   const int *ldc);
+extern void ssyr_ (const char *uplo, const int *n, const float *alpha,
+                  const float *x, const int *incx, float *A, const int *lda);
+extern void ssyrk_ (const char *uplo, const char *trans, const int *n,
+                   const int *k, const float *alpha, const float *A,
+                   const int *lda, const float *beta, float *C,
+                   const int *ldc);
+extern void strsm_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int *m, const int *n,
+                   const float *alpha, const float *A, const int *lda,
+                   float *B, const int *ldb);
+/* NOTE(review): declared as returning double, like sdot_ below; confirm this
+ * matches the return convention of the library selected by SYSTEM_BLAS. */
+extern double sasum_ (const int *n, const float *x, const int *incx);
+extern void sscal_ (const int *n, const float *alpha, float *x,
+                   const int *incx);
+extern void sger_(const int *m, const int *n, const float *alpha,
+                  const float *x, const int *incx, const float *y,
+                  const int *incy, float *A, const int *lda);
+extern void strsv_ (const char *uplo, const char *trans, const char *diag, 
+                   const int *n, const float *A, const int *lda, float *x, 
+                   const int *incx);
+extern void strmm_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int *m, const int *n,
+                 const float *alpha, const float *A, const int *lda,
+                 float *B, const int *ldb);
+extern void strmv_(const char *uplo, const char *transA, const char *diag,
+                 const int *n, const float *A, const int *lda, float *X,
+                 const int *incX);
+extern void saxpy_(const int *n, const float *alpha, float *X, const int *incX,
+		float *Y, const int *incy);
+extern int isamax_(const int *n, float *X, const int *incX);
+/* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
+extern double sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy);
+#endif
+
+#endif // __BLAS_H__

+ 46 - 0
examples/common/blas_model.c

@@ -0,0 +1,46 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "blas_model.h"
+#include <starpu.h>
+
+/*
+ * As a convention, in that file, descr[0] is represented by A,
+ * 				  descr[1] is B ...
+ */
+
+/*
+ *	Number of flops of Gemm 
+ */
+
+/*
+ * Cost estimate for a GEMM task (C = A * B).
+ *
+ * descr[0] describes A and descr[2] describes C. The result is
+ * nx(C) * ny(C) * nx(A), with the last factor divided by 1000 and by 4.11.
+ * NOTE(review): the source does not explain these two divisors; presumably
+ * they are a machine-specific calibration -- confirm.
+ */
+double gemm_cost(starpu_buffer_descr *descr)
+{
+	const uint32_t c_nx = starpu_get_blas_nx(descr[2].state);
+	const uint32_t c_ny = starpu_get_blas_ny(descr[2].state);
+	const uint32_t a_nx = starpu_get_blas_nx(descr[0].state);
+
+	/* keep the float literals: they fix the exact value of each divisor */
+	const double scaled_a_nx = (double)a_nx/1000.0f/4.11f;
+
+	return ((double)c_nx)*((double)c_ny)*scaled_a_nx;
+}

+ 41 - 0
examples/common/blas_model.h

@@ -0,0 +1,41 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_MODEL_H__
+#define __BLAS_MODEL_H__
+
+#include <starpu.h>
+
+double gemm_cost(starpu_buffer_descr *descr);
+
+/*
+ * Performance model for SGEMM tasks: type HISTORY_BASED with gemm_cost as the
+ * cost_model. The symbol string depends on which BLAS macro is defined at
+ * compile time (ATLAS, GOTO, or neither).
+ * NOTE(review): as a static definition in a header, every translation unit
+ * that includes this file gets its own copy -- confirm that is intended.
+ */
+static struct starpu_perfmodel_t sgemm_model = {
+	.cost_model = gemm_cost,
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = "sgemm_atlas"
+#elif defined(GOTO)
+	.symbol = "sgemm_goto"
+#else
+	.symbol = "sgemm"
+#endif
+};
+
+/* Variant of the SGEMM model using the COMMON type with the same cost
+ * function; no symbol is set. */
+static struct starpu_perfmodel_t sgemm_model_common = {
+	.cost_model = gemm_cost,
+	.type = COMMON,
+};
+
+#endif // __BLAS_MODEL_H__

+ 27 - 0
examples/cuda/incrementer_cuda.cu

@@ -0,0 +1,27 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "incrementer_cuda.h"
+
+/*
+ * Adds the first three entries of unity to the first three entries of tab.
+ * nx, pad1, nx2 and pad2 are not used by the kernel body.
+ */
+extern "C" __global__
+void cuda_incrementer(float * tab, uint32_t nx, uint32_t pad1, float *unity, uint32_t nx2, uint32_t pad2)
+{
+	for (unsigned i = 0; i < 3; i++)
+		tab[i] += unity[i];
+}

+ 23 - 0
examples/cuda/incrementer_cuda.h

@@ -0,0 +1,23 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __INCREMENTER_CUDA_H__
+#define __INCREMENTER_CUDA_H__
+
+#include <stdint.h>
+#include <cuda.h>
+
+#endif // __INCREMENTER_CUDA_H__

+ 110 - 0
examples/cuda/spmv_cuda.cu

@@ -0,0 +1,110 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdint.h>
+
+#define MIN(a,b)	((a)<(b)?(a):(b))
+
+/*
+ * Sparse matrix-vector product where each thread handles one contiguous
+ * slice of rows.
+ *
+ * For a row r, the entries rowptr[r]-firstentry up to (but excluding)
+ * rowptr[r+1]-firstentry of nzval/colind are multiplied with the matching
+ * vecin elements and summed into vecout[r]. nnz, elemsize*, nx_in and nx_out
+ * are not used by the body.
+ */
+extern "C" __global__
+void spmv_kernel(uint32_t nnz, uint32_t nrow, float *nzval, uint32_t *colind, uint32_t *rowptr,
+			uint32_t firstentry, uint32_t elemsize,
+			float *vecin, uint32_t nx_in, uint32_t elemsize1, float * vecout, uint32_t nx_out, uint32_t elemsize2)
+{
+	/* only the x dimension of the launch configuration is used */
+	const unsigned total_threads = gridDim.x*blockDim.x;
+	const unsigned tid = threadIdx.x + blockIdx.x*blockDim.x;
+
+	/* number of rows per thread, rounded up */
+	const unsigned chunk = (nrow + (total_threads - 1))/total_threads;
+
+	const unsigned first_row = tid * chunk;
+	const unsigned last_row = MIN(nrow, (tid+1) * chunk);
+
+	for (unsigned r = first_row; r < last_row; r++)
+	{
+		const unsigned begin = rowptr[r] - firstentry;
+		const unsigned end = rowptr[r+1] - firstentry;
+
+		float acc = 0.0f;
+		for (unsigned e = begin; e < end; e++)
+			acc += nzval[e]*vecin[colind[e]];
+
+		vecout[r] = acc;
+	}
+}
+
+/*
+ * Sparse matrix-vector product where each thread block owns one contiguous
+ * range of rows; inside the block, thread t handles rows t, t+blockDim.x, ...
+ * of that range.
+ *
+ * For a row r, the entries rowptr[r]-firstentry up to (but excluding)
+ * rowptr[r+1]-firstentry of nzval/colind are multiplied with the matching
+ * vecin elements and summed into vecout[r].
+ */
+extern "C" __global__
+void spmv_kernel_2(uint32_t nnz, uint32_t nrow, float *nzval, uint32_t *colind, uint32_t *rowptr,
+			uint32_t firstentry, uint32_t elemsize,
+			float *vecin, uint32_t nx_in, uint32_t elemsize1, float * vecout, uint32_t nx_out, uint32_t elemsize2)
+{
+	/* only the x dimension of the launch configuration is used */
+	const unsigned rows_per_block = (nrow + gridDim.x - 1)/gridDim.x;
+	const unsigned block_first = blockIdx.x*rows_per_block;
+	const unsigned block_last = MIN((blockIdx.x+1)*rows_per_block, nrow);
+
+	for (unsigned r = block_first + threadIdx.x; r < block_last; r += blockDim.x)
+	{
+		const unsigned begin = rowptr[r] - firstentry;
+		const unsigned end = rowptr[r+1] - firstentry;
+
+		float acc = 0.0f;
+		for (unsigned e = begin; e < end; e++)
+			acc += nzval[e]*vecin[colind[e]];
+
+		vecout[r] = acc;
+	}
+}
+
+
+
+/*
+ * Sparse matrix-vector product with one contiguous range of rows per thread
+ * block and a stride of blockDim.x between the rows handled by one thread.
+ * It performs the same computation as spmv_kernel_2.
+ */
+extern "C" __global__
+void spmv_kernel_3(uint32_t nnz, uint32_t nrow, float *nzval, uint32_t *colind, uint32_t *rowptr,
+			uint32_t firstentry, uint32_t elemsize,
+			float *vecin, uint32_t nx_in, uint32_t elemsize1, float * vecout, uint32_t nx_out, uint32_t elemsize2)
+{
+	/* only the x dimension of the launch configuration is used */
+	const unsigned rows_per_block = (nrow + gridDim.x - 1)/gridDim.x;
+	const unsigned range_begin = blockIdx.x*rows_per_block;
+	const unsigned range_end = MIN((blockIdx.x+1)*rows_per_block, nrow);
+
+	unsigned r = range_begin + threadIdx.x;
+	while (r < range_end)
+	{
+		const unsigned last = rowptr[r+1] - firstentry;
+		float sum = 0.0f;
+
+		for (unsigned e = rowptr[r] - firstentry; e < last; e++)
+			sum += nzval[e]*vecin[colind[e]];
+
+		vecout[r] = sum;
+		r += blockDim.x;
+	}
+}
+

+ 28 - 0
examples/fortran/bindings/Makefile

@@ -0,0 +1,28 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+FC=gfortran
+CFLAGS+= -g -Wall
+FFLAGS+= -g -Wall
+
+# Neither target produces a file of the same name, so declare them phony;
+# otherwise a stray file called "all" or "clean" would disable the target.
+.PHONY: all clean
+
+all: hello
+
+# hello.F includes StarPU-fortran.h through the preprocessor.
+hello.F: StarPU-fortran.h
+
+# No recipe is given here: building hello is left to the implicit rules of make.
+hello: hello.F hello-c.o
+
+clean:
+	rm -f *.o *.mod hello

+ 54 - 0
examples/fortran/bindings/StarPU-fortran.h

@@ -0,0 +1,54 @@
+C
+C StarPU
+C Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+C
+C This program is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at
+C your option) any later version.
+C
+C This program is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of
+C MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+C
+C See the GNU Lesser General Public License in COPYING.LGPL for more details.
+C
+
+C     Module STARPU_FORTRAN: declares the derived type codelet (three
+C     REAL components A, B and C) and two subroutines. PRINT_INT writes
+C     its integer argument; STARPU_SUBMIT_CODELET calls the procedure
+C     CPUFUNC it receives, passing it ARG.
+      MODULE STARPU_FORTRAN
+        USE ISO_C_BINDING
+
+          TYPE codelet
+              REAL :: A,B,C
+          END TYPE codelet
+
+      CONTAINS
+      
+          SUBROUTINE PRINT_INT(X)
+              INTEGER :: X
+              WRITE(*,*) 'X =', X
+          END SUBROUTINE
+
+          SUBROUTINE STARPU_SUBMIT_CODELET(CPUFUNC, ARG)
+              INTEGER :: ARG
+
+C             Explicit interface of the procedure received as CPUFUNC
+              INTERFACE
+                  SUBROUTINE CPUFUNC(ARG)
+                      INTEGER :: ARG
+                  END SUBROUTINE
+              END INTERFACE
+
+              CALL CPUFUNC(ARG)
+          END SUBROUTINE
+
+      END MODULE STARPU_FORTRAN
+
+C     Module STARPU_FORTRAN2: contains only PRINT_INT2, which writes
+C     its integer argument with list-directed output.
+      MODULE STARPU_FORTRAN2
+        USE ISO_C_BINDING
+
+      CONTAINS
+          SUBROUTINE PRINT_INT2(X)
+              INTEGER :: X
+              WRITE(*,*) 'X =', X
+          END SUBROUTINE
+
+      END MODULE STARPU_FORTRAN2

+ 33 - 0
examples/fortran/bindings/hello-c.c

@@ -0,0 +1,33 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include <f77.h>
+
+#define fline_length 80
+
+/* Declaration of the Fortran subroutine hellosub, written with the f77.h
+ * macros. */
+extern F77_SUBROUTINE(hellosub)( INTEGER(i) TRAIL(line) );
+
+
+/*
+ * Prints the integer argument to stderr, then calls hellosub with it.
+ * NOTE(review): the macros come from f77.h, which is not visible here;
+ * TRAIL_ARG(fline) presumably relies on the fline_length macro defined
+ * above -- confirm against that header.
+ */
+void dummy_c_func_(INTEGER(i))
+{
+	fprintf(stderr, "i = %d\n", *INTEGER_ARG(i));
+
+	F77_CALL(hellosub)(INTEGER_ARG(i)TRAIL_ARG(fline));
+}

+ 35 - 0
examples/fortran/bindings/hello.F

@@ -0,0 +1,35 @@
+C
+C StarPU
+C Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+C
+C This program is free software; you can redistribute it and/or modify
+C it under the terms of the GNU Lesser General Public License as published by
+C the Free Software Foundation; either version 2.1 of the License, or (at
+C your option) any later version.
+C
+C This program is distributed in the hope that it will be useful, but
+C WITHOUT ANY WARRANTY; without even the implied warranty of
+C MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+C
+C See the GNU Lesser General Public License in COPYING.LGPL for more details.
+C
+
+#include "StarPU-fortran.h"
+      
+C     Writes the label and the value of the integer X using
+C     list-directed output.
+      SUBROUTINE HELLOSUB(X)
+          INTEGER :: X
+          WRITE(*,*) 'X =', X
+      END SUBROUTINE
+
+C     Main program: sets TOTO to 42 and passes PRINT_INT together with
+C     TOTO to STARPU_SUBMIT_CODELET from the STARPU_FORTRAN module.
+      PROGRAM HELLO
+          USE STARPU_FORTRAN
+          USE ISO_C_BINDING
+
+          INTEGER :: TOTO
+          TOTO = 42
+
+          CALL STARPU_SUBMIT_CODELET(PRINT_INT, TOTO)
+C           CALL STARPU_SUBMIT_CODELET(HELLOSUB, TOTO)
+
+C          CALL DUMMY_C_FUNC(TOTO)
+      END PROGRAM

+ 41 - 0
examples/heat/Makefile.in

@@ -0,0 +1,41 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+# The doubled dollar defers the pkg-config calls to the shell running each recipe.
+LIBS+=$$(pkg-config --libs libstarpu)
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+OBJS := ../common/blas.o dw_sparse_cg.o dw_sparse_cg_kernels.o dw_factolu.o dw_factolu_tag.o heat_display.o heat.o lu_kernels_model.o dw_factolu_kernels.o
+
+# Neither target produces a file of the same name, so declare them phony;
+# otherwise a stray file called "all" or "clean" would disable the target.
+.PHONY: all clean
+
+all: heat
+
+# Dependency files are neither generated nor included for clean/distclean.
+ifeq ($(filter ${MAKECMDGOALS},clean distclean),)
+%.d: %.c
+	$(CC) $(CFLAGS) $< -MM -o $*.d
+
+-include $(OBJS:.o=.d)
+endif
+
+%.o: %.c
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+heat: $(OBJS)
+	$(CC) $(OBJS) -o heat $(LDFLAGS) $(LIBS)
+
+clean:
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f heat

+ 749 - 0
examples/heat/dw_factolu.c

@@ -0,0 +1,749 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_factolu.h"
+#include <sys/time.h>
+
+/*
+ * Progress flags of the v2 (callback-driven) factorization. Each entry is 0,
+ * STARTED or DONE; the arrays are allocated, zero-filled, by
+ * dw_codelet_facto_v2.
+ */
+uint8_t *advance_12_21; /* size nblocks*nblocks: 12 tasks at [i*nblocks + k], 21 tasks at [i + k*nblocks] */
+uint8_t *advance_11; /* size nblocks: one entry per diagonal block */
+uint8_t *advance_22; /* size nblocks*nblocks*nblocks: task (k,i,j) at [k*nblocks*nblocks + i + j*nblocks] */
+
+/* wall-clock timestamps taken around the factorization by dw_codelet_facto*() */
+struct timeval start;
+struct timeval end;
+
+/*
+ * Codelets of the four kinds of block tasks. All of them set .where = ANY,
+ * provide a core_func and, when USE_CUDA is defined, a cublas_func, and point
+ * to their own performance model. nbuffers matches the number of
+ * task->buffers[] entries filled by the submitters in this file.
+ */
+
+/* 11: factorization of a diagonal block (1 buffer) */
+static starpu_codelet cl11 =
+{
+	.where = ANY,
+	.core_func = dw_core_codelet_update_u11,
+#ifdef USE_CUDA
+	.cublas_func = dw_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &model_11
+};
+
+/* 12: update of one block from the diagonal block (2 buffers) */
+static starpu_codelet cl12 =
+{
+	.where = ANY,
+	.core_func = dw_core_codelet_update_u12,
+#ifdef USE_CUDA
+	.cublas_func = dw_cublas_codelet_update_u12,
+#endif
+	.nbuffers = 2,
+	.model = &model_12
+}; 
+
+/* 21: counterpart of 12 on the other side of the diagonal (2 buffers) */
+static starpu_codelet cl21 =
+{
+	.where = ANY,
+	.core_func = dw_core_codelet_update_u21,
+#ifdef USE_CUDA
+	.cublas_func = dw_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &model_21
+}; 
+
+/* 22: update of a trailing block from a 21 and a 12 block (3 buffers) */
+static starpu_codelet cl22 =
+{
+	.where = ANY,
+	.core_func = dw_core_codelet_update_u22,
+#ifdef USE_CUDA
+	.cublas_func = dw_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &model_22
+}; 
+
+
+
+/*
+ * Values stored in the advance_* arrays. STARTED is set with an atomic OR by
+ * the callback that wins the right to submit a task; DONE is written with a
+ * plain store (replacing STARTED) by the callback of the finished task.
+ */
+#define STARTED	0x01
+#define DONE	0x10
+
+/*
+ *	Upgraded Callbacks : break the pipeline design !
+ */
+
+/*
+ * Completion callback of a 22 task in the v2 scheme.
+ *
+ * argcb is the cl_args of the finished task (step k, block i,j). The task is
+ * recorded as DONE in advance_22, then the follow-up tasks that may now run
+ * are submitted:
+ *  - the 11 task of step k+1 when the block is the diagonal block (k+1,k+1);
+ *  - a 21 task (when i == k+1) and/or a 12 task (when j == k+1) of step k+1,
+ *    provided advance_11[k+1] is already DONE and no other callback has
+ *    marked that task STARTED.
+ * The descriptor is freed before returning.
+ */
+void dw_callback_v2_codelet_update_u22(void *argcb)
+{
+	cl_args *args = argcb;	
+
+	unsigned k = args->k;
+	unsigned i = args->i;
+	unsigned j = args->j;
+	unsigned nblocks = args->nblocks;
+
+	/* we did task 22k,i,j */
+	advance_22[k*nblocks*nblocks + i + j*nblocks] = DONE;
+	
+	if ( (i == j) && (i == k+1)) {
+		/* we now reduce the LU22 part (recursion appears there) */
+		/* j and k of u11arg are left unset: the 11 callback of this file
+		 * only reads i, nblocks, dataA and sem */
+		cl_args *u11arg = malloc(sizeof(cl_args));
+
+		struct starpu_task *task = starpu_task_create();
+			task->callback_func = dw_callback_v2_codelet_update_u11;
+			task->callback_arg = u11arg;
+			task->cl = &cl11;
+			task->cl_arg = u11arg;
+
+			task->buffers[0].state =
+				get_sub_data(args->dataA, 2, k+1, k+1);
+			task->buffers[0].mode = RW;
+	
+		u11arg->dataA = args->dataA;
+		u11arg->i = k + 1;
+		u11arg->nblocks = args->nblocks;
+		u11arg->sem = args->sem;
+
+		/* schedule the codelet */
+		task->priority = MAX_PRIO;
+		starpu_submit_task(task);
+	}
+
+	/* 11k+1 + 22k,k+1,j => 21 k+1,j */
+	if ( i == k + 1) {
+		uint8_t dep;
+		/* 11 k+1*/
+		dep = advance_11[(k+1)];
+		if (dep & DONE) {
+			/* try to push the task */
+			/* the value returned is tested for a previously set STARTED bit */
+			uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1) + j*nblocks], STARTED);
+				if ((u & STARTED) == 0) {
+					/* we are the only one that should 
+					 * launch that task */
+					cl_args *u21a = malloc(sizeof(cl_args));
+
+					struct starpu_task *task21 = starpu_task_create();
+					task21->callback_func = dw_callback_v2_codelet_update_u21;
+					task21->callback_arg = u21a;
+					task21->cl = &cl21;
+					task21->cl_arg = u21a;
+			
+					u21a->i = k+1;
+					u21a->k = j;
+					u21a->nblocks = args->nblocks;
+					u21a->dataA = args->dataA;
+					u21a->sem = args->sem;
+
+					task21->buffers[0].state = 
+						get_sub_data(args->dataA, 2, u21a->i, u21a->i);
+					task21->buffers[0].mode = R;
+					task21->buffers[1].state =
+						get_sub_data(args->dataA, 2, u21a->i, u21a->k);
+					task21->buffers[1].mode = RW;
+		
+					starpu_submit_task(task21);
+				}
+		}
+	}
+
+	/* 11k + 22k-1,i,k => 12 k,i */
+	if (j == k + 1) {
+		uint8_t dep;
+		/* 11 k+1*/
+		dep = advance_11[(k+1)];
+		if (dep & DONE) {
+			/* try to push the task */
+			uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1)*nblocks + i], STARTED);
+				 if ((u & STARTED) == 0) {
+					/* we are the only one that should launch that task */
+					cl_args *u12a = malloc(sizeof(cl_args));
+
+					struct starpu_task *task12 = starpu_task_create();
+						task12->callback_func = dw_callback_v2_codelet_update_u12;
+						task12->callback_arg = u12a;
+						task12->cl = &cl12;
+						task12->cl_arg = u12a;
+
+					u12a->i = k+1;
+					u12a->k = i;
+					u12a->nblocks = args->nblocks;
+					u12a->dataA = args->dataA;
+					u12a->sem = args->sem;
+
+					task12->buffers[0].state = get_sub_data(args->dataA, 2, u12a->i, u12a->i); 
+					task12->buffers[0].mode = R;
+					task12->buffers[1].state = get_sub_data(args->dataA, 2, u12a->k, u12a->i); 
+					task12->buffers[1].mode = RW;
+					
+					starpu_submit_task(task12);
+				}
+		}
+	}
+
+	/* the descriptor of the finished task is no longer needed */
+	free(args);
+}
+
+/*
+ * Completion callback of a 12 task in the v2 scheme.
+ *
+ * argcb is the cl_args of the finished task: args->i is the elimination step
+ * and args->k the index of the block that was updated. The task is recorded
+ * as DONE in advance_12_21, then, for every slicey > i whose 21 task of the
+ * same step is already DONE, the 22 task (step i, block k,slicey) is
+ * submitted unless another callback marked it STARTED first.
+ *
+ * The descriptor belongs to the finished task and is freed before returning,
+ * as dw_callback_v2_codelet_update_u22 does with its own.
+ */
+void dw_callback_v2_codelet_update_u12(void *argcb)
+{
+	cl_args *args = argcb;
+
+	/* now launch the update of LU22 */
+	unsigned i = args->i;
+	unsigned k = args->k;
+	unsigned nblocks = args->nblocks;
+
+	/* we did task 12i,k */
+	advance_12_21[i*nblocks + k] = DONE;
+
+	unsigned slicey;
+	for (slicey = i+1; slicey < nblocks; slicey++)
+	{
+		/* can we launch 22 i,args->k,slicey ? */
+		/* deps : 21 args->k, slicey */
+		uint8_t dep;
+		dep = advance_12_21[i + slicey*nblocks];
+		if (dep & DONE)
+		{
+			/* perhaps we may schedule the 22 i,args->k,slicey task */
+			uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + slicey*nblocks + k], STARTED);
+			if ((u & STARTED) == 0) {
+				/* update that square matrix */
+				cl_args *u22a = malloc(sizeof(cl_args));
+				STARPU_ASSERT(u22a);
+
+				struct starpu_task *task22 = starpu_task_create();
+				task22->callback_func = dw_callback_v2_codelet_update_u22;
+				task22->callback_arg = u22a;
+				task22->cl = &cl22;
+				task22->cl_arg = u22a;
+
+				u22a->k = i;
+				u22a->i = k;
+				u22a->j = slicey;
+				u22a->dataA = args->dataA;
+				u22a->nblocks = nblocks;
+				u22a->sem = args->sem;
+
+				task22->buffers[0].state = get_sub_data(args->dataA, 2, u22a->i, u22a->k);
+				task22->buffers[0].mode = R;
+
+				task22->buffers[1].state = get_sub_data(args->dataA, 2, u22a->k, u22a->j);
+				task22->buffers[1].mode = R;
+
+				task22->buffers[2].state = get_sub_data(args->dataA, 2, u22a->i, u22a->j);
+				task22->buffers[2].mode = RW;
+
+				/* schedule that codelet */
+				if (slicey == i+1)
+					task22->priority = MAX_PRIO;
+
+				starpu_submit_task(task22);
+			}
+		}
+	}
+
+	/* nothing reads the descriptor of the finished task past this point */
+	free(args);
+}
+
+/*
+ * Completion callback of a 21 task in the v2 scheme.
+ *
+ * argcb is the cl_args of the finished task: args->i is the elimination step
+ * and args->k the index of the block that was updated. The task is recorded
+ * as DONE in advance_12_21, then, for every slicex > i whose 12 task of the
+ * same step is already DONE, the 22 task (step i, block slicex,k) is
+ * submitted unless another callback marked it STARTED first.
+ *
+ * The descriptor belongs to the finished task and is freed before returning,
+ * as dw_callback_v2_codelet_update_u22 does with its own.
+ */
+void dw_callback_v2_codelet_update_u21(void *argcb)
+{
+	cl_args *args = argcb;
+
+	/* now launch the update of LU22 */
+	unsigned i = args->i;
+	unsigned k = args->k;
+	unsigned nblocks = args->nblocks;
+
+	/* we did task 21i,k */
+	advance_12_21[i + k*nblocks] = DONE;
+
+	unsigned slicex;
+	for (slicex = i+1; slicex < nblocks; slicex++)
+	{
+		/* can we launch 22 i,slicex,k ? */
+		/* deps : 12 slicex k */
+		uint8_t dep;
+		dep = advance_12_21[i*nblocks + slicex];
+		if (dep & DONE)
+		{
+			/* perhaps we may schedule the 22 i,slicex,k task */
+			uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + k*nblocks + slicex], STARTED);
+			if ((u & STARTED) == 0) {
+				/* update that square matrix */
+				cl_args *u22a = malloc(sizeof(cl_args));
+				STARPU_ASSERT(u22a);
+
+				struct starpu_task *task22 = starpu_task_create();
+				task22->callback_func = dw_callback_v2_codelet_update_u22;
+				task22->callback_arg = u22a;
+				task22->cl = &cl22;
+				task22->cl_arg = u22a;
+
+				u22a->k = i;
+				u22a->i = slicex;
+				u22a->j = k;
+				u22a->dataA = args->dataA;
+				u22a->nblocks = nblocks;
+				u22a->sem = args->sem;
+
+				task22->buffers[0].state = get_sub_data(args->dataA, 2, u22a->i, u22a->k);
+				task22->buffers[0].mode = R;
+
+				task22->buffers[1].state = get_sub_data(args->dataA, 2, u22a->k, u22a->j);
+				task22->buffers[1].mode = R;
+
+				task22->buffers[2].state = get_sub_data(args->dataA, 2, u22a->i, u22a->j);
+				task22->buffers[2].mode = RW;
+
+				/* schedule that codelet */
+				if (slicex == i+1)
+					task22->priority = MAX_PRIO;
+
+				starpu_submit_task(task22);
+			}
+		}
+	}
+
+	/* nothing reads the descriptor of the finished task past this point */
+	free(args);
+}
+
+/*
+ * Completion callback of an 11 task in the v2 scheme.
+ *
+ * argcb is the cl_args of the finished task; args->i is the index of the
+ * diagonal block. The task is recorded as DONE in advance_11. After the last
+ * diagonal block the semaphore args->sem is posted. Otherwise, for every
+ * slice > i, the 12 and the 21 task of step i are submitted when the 22 task
+ * of step i-1 they depend on is DONE (there is no such dependency at step 0)
+ * and no other callback marked them STARTED.
+ *
+ * The descriptor belongs to the finished task and is freed on every path.
+ */
+void dw_callback_v2_codelet_update_u11(void *argcb)
+{
+	/* in case there remains work, go on */
+	cl_args *args = argcb;
+
+	unsigned nblocks = args->nblocks;
+	unsigned i = args->i;
+
+	/* we did task 11k */
+	advance_11[i] = DONE;
+
+	if (i == nblocks - 1)
+	{
+		/* we are done : wake the application up  */
+		sem_t *sem = args->sem;
+		free(args);
+		sem_post(sem);
+		return;
+	}
+
+	/* put new tasks */
+	unsigned slice;
+	for (slice = i + 1; slice < nblocks; slice++)
+	{
+		/* can we launch 12i,slice ? */
+		uint8_t deps12;
+		if (i == 0) {
+			deps12 = DONE;
+		}
+		else {
+			deps12 = advance_22[(i-1)*nblocks*nblocks + slice + i*nblocks];
+		}
+		if (deps12 & DONE) {
+			/* we may perhaps launch the task 12i,slice */
+			uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i*nblocks + slice], STARTED);
+			if ((u & STARTED) == 0) {
+				/* we are the only one that should launch that task */
+				cl_args *u12a = malloc(sizeof(cl_args));
+				STARPU_ASSERT(u12a);
+
+				struct starpu_task *task12 = starpu_task_create();
+				task12->callback_func = dw_callback_v2_codelet_update_u12;
+				task12->callback_arg = u12a;
+				task12->cl = &cl12;
+				task12->cl_arg = u12a;
+
+				u12a->i = i;
+				u12a->k = slice;
+				u12a->nblocks = args->nblocks;
+				u12a->dataA = args->dataA;
+				u12a->sem = args->sem;
+
+				task12->buffers[0].state = get_sub_data(args->dataA, 2, u12a->i, u12a->i);
+				task12->buffers[0].mode = R;
+				task12->buffers[1].state = get_sub_data(args->dataA, 2, u12a->k, u12a->i);
+				task12->buffers[1].mode = RW;
+
+				if (slice == i +1)
+					task12->priority = MAX_PRIO;
+
+				starpu_submit_task(task12);
+			}
+		}
+
+		/* can we launch 21i,slice ? */
+		uint8_t deps21;
+		if (i == 0) {
+			deps21 = DONE;
+		}
+		else {
+			deps21 = advance_22[(i-1)*nblocks*nblocks + slice*nblocks + i];
+		}
+		if (deps21 & DONE) {
+			/* we may perhaps launch the task 21i,slice */
+			uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i + slice*nblocks], STARTED);
+			if ((u & STARTED) == 0) {
+				/* we are the only one that should launch that task */
+				cl_args *u21a = malloc(sizeof(cl_args));
+				STARPU_ASSERT(u21a);
+
+				struct starpu_task *task21 = starpu_task_create();
+				task21->callback_func = dw_callback_v2_codelet_update_u21;
+				task21->callback_arg = u21a;
+				task21->cl = &cl21;
+				task21->cl_arg = u21a;
+
+				u21a->i = i;
+				u21a->k = slice;
+				u21a->nblocks = args->nblocks;
+				u21a->dataA = args->dataA;
+				u21a->sem = args->sem;
+
+				task21->buffers[0].state = get_sub_data(args->dataA, 2, u21a->i, u21a->i);
+				task21->buffers[0].mode = R;
+				task21->buffers[1].state = get_sub_data(args->dataA, 2, u21a->i, u21a->k);
+				task21->buffers[1].mode = RW;
+
+				if (slice == i +1)
+					task21->priority = MAX_PRIO;
+
+				starpu_submit_task(task21);
+			}
+		}
+	}
+
+	/* nothing reads the descriptor of the finished task past this point */
+	free(args);
+}
+
+
+
+/*
+ *	Callbacks 
+ */
+
+
+/*
+ * Completion callback of an 11 task in the version 1 scheme.
+ *
+ * argcb is the cl_args of the finished task; args->i is the index of the
+ * diagonal block. After the last diagonal block the semaphore args->sem is
+ * posted. Otherwise one 12 and one 21 task are submitted for every
+ * slice > i; they share a freshly allocated counter, initialised to the
+ * number of tasks submitted, that dw_callback_codelet_update_u12_21
+ * decrements.
+ *
+ * The descriptor belongs to the finished task and is freed on every path.
+ */
+void dw_callback_codelet_update_u11(void *argcb)
+{
+	/* in case there remains work, go on */
+	cl_args *args = argcb;
+
+	if (args->i == args->nblocks - 1)
+	{
+		/* we are done : wake the application up  */
+		sem_t *sem = args->sem;
+		free(args);
+		sem_post(sem);
+		return;
+	}
+
+	/* put new tasks */
+	unsigned nslices;
+	nslices = args->nblocks - 1 - args->i;
+
+	unsigned *remaining = malloc(sizeof(unsigned));
+	STARPU_ASSERT(remaining);
+	*remaining = 2*nslices;
+
+	unsigned slice;
+	for (slice = args->i + 1; slice < args->nblocks; slice++)
+	{
+		/* update slice from u12 */
+		cl_args *u12a = malloc(sizeof(cl_args));
+		STARPU_ASSERT(u12a);
+
+		/* update slice from u21 */
+		cl_args *u21a = malloc(sizeof(cl_args));
+		STARPU_ASSERT(u21a);
+
+		struct starpu_task *task12 = starpu_task_create();
+		task12->callback_func = dw_callback_codelet_update_u12_21;
+		task12->callback_arg = u12a;
+		task12->cl = &cl12;
+		task12->cl_arg = u12a;
+
+		struct starpu_task *task21 = starpu_task_create();
+		task21->callback_func = dw_callback_codelet_update_u12_21;
+		task21->callback_arg = u21a;
+		task21->cl = &cl21;
+		task21->cl_arg = u21a;
+
+		u12a->i = args->i;
+		u12a->k = slice;
+		u12a->nblocks = args->nblocks;
+		u12a->dataA = args->dataA;
+		u12a->remaining = remaining;
+		u12a->sem = args->sem;
+
+		u21a->i = args->i;
+		u21a->k = slice;
+		u21a->nblocks = args->nblocks;
+		u21a->dataA = args->dataA;
+		u21a->remaining = remaining;
+		u21a->sem = args->sem;
+
+		task12->buffers[0].state =
+			get_sub_data(args->dataA, 2, u12a->i, u12a->i);
+		task12->buffers[0].mode = R;
+		task12->buffers[1].state =
+			get_sub_data(args->dataA, 2, u12a->k, u12a->i);
+		task12->buffers[1].mode = RW;
+
+		task21->buffers[0].state =
+			get_sub_data(args->dataA, 2, u21a->i, u21a->i);
+		task21->buffers[0].mode = R;
+		task21->buffers[1].state =
+			get_sub_data(args->dataA, 2, u21a->i, u21a->k);
+		task21->buffers[1].mode = RW;
+
+		starpu_submit_task(task12);
+		starpu_submit_task(task21);
+	}
+
+	/* nothing reads the descriptor of the finished task past this point */
+	free(args);
+}
+
+
+/*
+ * Completion callback of a 22 task in the version 1 scheme: decrements the
+ * counter shared by the 22 tasks of the step and, once it reaches zero,
+ * releases it and submits the 11 task of block (k+1,k+1). The descriptor of
+ * the finished task is always freed.
+ */
+void dw_callback_codelet_update_u22(void *argcb)
+{
+	cl_args *finished = argcb;
+	cl_args *next_args;
+	struct starpu_task *next_task;
+	unsigned next_step;
+
+	if (STARPU_ATOMIC_ADD(finished->remaining, (-1)) != 0)
+	{
+		/* other 22 tasks of this step are still pending */
+		free(finished);
+		return;
+	}
+
+	/* every worker has used the counter by now */
+	free(finished->remaining);
+
+	/* move on to the diagonal block of the next step */
+	next_step = finished->k + 1;
+
+	next_args = malloc(sizeof(cl_args));
+
+	next_task = starpu_task_create();
+	next_task->cl = &cl11;
+	next_task->cl_arg = next_args;
+	next_task->callback_func = dw_callback_codelet_update_u11;
+	next_task->callback_arg = next_args;
+
+	next_task->buffers[0].state = get_sub_data(finished->dataA, 2, next_step, next_step);
+	next_task->buffers[0].mode = RW;
+
+	next_args->sem = finished->sem;
+	next_args->nblocks = finished->nblocks;
+	next_args->i = next_step;
+	next_args->dataA = finished->dataA;
+
+	starpu_submit_task(next_task);
+
+	free(finished);
+}
+
+/*
+ * Completion callback shared by the 12 and 21 tasks of the version 1 scheme.
+ *
+ * Decrements the counter shared by all 12/21 tasks of step args->i. The
+ * callback that brings it to zero releases that counter, allocates a new one
+ * holding the number of 22 tasks of the step, and submits those tasks.
+ *
+ * The descriptor belongs to the finished task and is freed before returning,
+ * as dw_callback_codelet_update_u22 does with its own.
+ */
+void dw_callback_codelet_update_u12_21(void *argcb)
+{
+	cl_args *args = argcb;
+
+	if (STARPU_ATOMIC_ADD(args->remaining, -1) == 0)
+	{
+		/* every 12/21 callback of this step has used the counter */
+		free(args->remaining);
+
+		/* now launch the update of LU22 */
+		unsigned i = args->i;
+		unsigned nblocks = args->nblocks;
+
+		/* the number of tasks to be done */
+		unsigned *remaining = malloc(sizeof(unsigned));
+		STARPU_ASSERT(remaining);
+		*remaining = (nblocks - 1 - i)*(nblocks - 1 - i);
+
+		unsigned slicey, slicex;
+		for (slicey = i+1; slicey < nblocks; slicey++)
+		{
+			for (slicex = i+1; slicex < nblocks; slicex++)
+			{
+				/* update that square matrix */
+				cl_args *u22a = malloc(sizeof(cl_args));
+				STARPU_ASSERT(u22a);
+
+				struct starpu_task *task22 = starpu_task_create();
+				task22->callback_func = dw_callback_codelet_update_u22;
+				task22->callback_arg = u22a;
+				task22->cl = &cl22;
+				task22->cl_arg = u22a;
+
+				u22a->k = i;
+				u22a->i = slicex;
+				u22a->j = slicey;
+				u22a->dataA = args->dataA;
+				u22a->nblocks = nblocks;
+				u22a->remaining = remaining;
+				u22a->sem = args->sem;
+
+				task22->buffers[0].state = get_sub_data(args->dataA, 2, u22a->i, u22a->k);
+				task22->buffers[0].mode = R;
+
+				task22->buffers[1].state = get_sub_data(args->dataA, 2, u22a->k, u22a->j);
+				task22->buffers[1].mode = R;
+
+				task22->buffers[2].state = get_sub_data(args->dataA, 2, u22a->i, u22a->j);
+				task22->buffers[2].mode = RW;
+
+				/* schedule that codelet */
+				starpu_submit_task(task22);
+			}
+		}
+	}
+
+	/* nothing reads the descriptor of the finished task past this point */
+	free(args);
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
+/*
+ * Run the version 1 (counter-based callbacks) factorization of dataA and
+ * block until the last diagonal block is done.
+ *
+ * Submits the 11 task of block (0,0); the callbacks submit everything else
+ * and post the local semaphore at the end. The elapsed time in milliseconds
+ * is printed on stdout, its label and a GFlops estimate (2*n^3/3 flops, n
+ * being starpu_get_blas_nx(dataA)) on stderr.
+ */
+void dw_codelet_facto(starpu_data_handle dataA, unsigned nblocks)
+{
+	cl_args *args = malloc(sizeof(cl_args));
+	STARPU_ASSERT(args);
+
+	sem_t sem;
+
+	sem_init(&sem, 0, 0U);
+
+	args->sem = &sem;
+	args->i = 0;
+	args->nblocks = nblocks;
+	args->dataA = dataA;
+
+	gettimeofday(&start, NULL);
+
+	/* inject a new task with this codelet into the system */
+	struct starpu_task *task = starpu_task_create();
+	task->callback_func = dw_callback_codelet_update_u11;
+	task->callback_arg = args;
+	task->cl = &cl11;
+	task->cl_arg = args;
+
+	task->buffers[0].state = get_sub_data(dataA, 2, 0, 0);
+	task->buffers[0].mode = RW;
+
+	/* schedule the codelet */
+	starpu_submit_task(task);
+
+	/* stall the application until the end of computations */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	gettimeofday(&end, NULL);
+
+	/* elapsed microseconds; converted to double before scaling so that the
+	 * product cannot overflow a 32-bit long on long runs */
+	double timing = (double)(end.tv_sec - start.tv_sec)*1000000.0
+			+ (double)(end.tv_usec - start.tv_usec);
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	/* flop count kept in double: float cannot hold n^3 exactly for large n */
+	unsigned n = starpu_get_blas_nx(dataA);
+	double flop = (2.0*n*n*n)/3.0;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0));
+}
+
+/*
+ * Run the v2 (progress-array callbacks) factorization of dataA and block
+ * until the last diagonal block is done.
+ *
+ * Allocates the zero-filled advance_* arrays, submits the 11 task of block
+ * (0,0) and waits on a local semaphore posted by the final 11 callback. The
+ * elapsed time in milliseconds is printed on stdout, its label and a GFlops
+ * estimate (2*n^3/3 flops, n being starpu_get_blas_nx(dataA)) on stderr.
+ *
+ * NOTE(review): the advance_* arrays are not released here. The 22 callback
+ * that submits the last 11 task keeps reading them after the submission, so
+ * freeing them right after sem_wait() looks unsafe -- confirm before adding
+ * the free() calls.
+ */
+void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
+{
+
+	advance_11 = calloc(nblocks, sizeof(uint8_t));
+	STARPU_ASSERT(advance_11);
+
+	advance_12_21 = calloc(nblocks*nblocks, sizeof(uint8_t));
+	STARPU_ASSERT(advance_12_21);
+
+	advance_22 = calloc(nblocks*nblocks*nblocks, sizeof(uint8_t));
+	STARPU_ASSERT(advance_22);
+
+	cl_args *args = malloc(sizeof(cl_args));
+	STARPU_ASSERT(args);
+
+	sem_t sem;
+
+	sem_init(&sem, 0, 0U);
+
+	args->sem = &sem;
+	args->i = 0;
+	args->nblocks = nblocks;
+	args->dataA = dataA;
+
+	gettimeofday(&start, NULL);
+
+	/* inject a new task with this codelet into the system */
+	struct starpu_task *task = starpu_task_create();
+	task->callback_func = dw_callback_v2_codelet_update_u11;
+	task->callback_arg = args;
+	task->cl = &cl11;
+	task->cl_arg = args;
+
+	task->buffers[0].state = get_sub_data(dataA, 2, 0, 0);
+	task->buffers[0].mode = RW;
+
+	/* schedule the codelet */
+	starpu_submit_task(task);
+
+	/* stall the application until the end of computations */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	gettimeofday(&end, NULL);
+
+	/* elapsed microseconds; converted to double before scaling so that the
+	 * product cannot overflow a 32-bit long on long runs */
+	double timing = (double)(end.tv_sec - start.tv_sec)*1000000.0
+			+ (double)(end.tv_usec - start.tv_usec);
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	/* flop count kept in double: float cannot hold n^3 exactly for large n */
+	unsigned n = starpu_get_blas_nx(dataA);
+	double flop = (2.0*n*n*n)/3.0;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0));
+}
+
+/*
+ * Start StarPU and the timing facility, then allocate the dim x dim float
+ * matrix *A and the dim-element float vector *B, through
+ * starpu_malloc_pinned_if_possible() when pinned is non-zero and through
+ * malloc() otherwise.
+ */
+void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
+{
+	starpu_init();
+
+	timing_init();
+
+	if (!pinned)
+	{
+		*A = malloc(dim*dim*sizeof(float));
+		*B = malloc(dim*sizeof(float));
+		return;
+	}
+
+	starpu_malloc_pinned_if_possible(A, dim*dim*sizeof(float));
+	starpu_malloc_pinned_if_possible(B, dim*sizeof(float));
+}
+
+/*
+ * Factorize, in place, the size x size float matrix matA (leading dimension
+ * ld) after cutting it into nblocks x nblocks blocks.
+ *
+ * version 1 selects dw_codelet_facto; any other value selects
+ * dw_codelet_facto_v2. When CHECK_RESULTS is defined, a copy of the input is
+ * kept and handed to compare_A_LU() together with the result.
+ */
+void dw_factoLU(float *matA, unsigned size,
+		unsigned ld, unsigned nblocks,
+		unsigned version)
+{
+
+#ifdef CHECK_RESULTS
+	fprintf(stderr, "Checking results ...\n");
+	/* byte count computed in size_t: ld*ld wraps in unsigned from 65536 on */
+	size_t saved_bytes = (size_t)ld*ld*sizeof(float);
+	float *Asaved = malloc(saved_bytes);
+	STARPU_ASSERT(Asaved);
+
+	memcpy(Asaved, matA, saved_bytes);
+#endif
+
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_monitor_blas_data(&dataA, 0, (uintptr_t)matA, ld,
+			size, size, sizeof(float));
+
+	starpu_filter f;
+		f.filter_func = starpu_vertical_block_filter_func;
+		f.filter_arg = nblocks;
+
+	starpu_filter f2;
+		f2.filter_func = starpu_block_filter_func;
+		f2.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &f, &f2);
+
+	switch (version) {
+		case 1:
+			dw_codelet_facto(dataA, nblocks);
+			break;
+		default:
+		case 2:
+			dw_codelet_facto_v2(dataA, nblocks);
+			break;
+	}
+
+	/* gather all the data */
+	starpu_unpartition_data(dataA, 0);
+
+	starpu_delete_data(dataA);
+
+#ifdef CHECK_RESULTS
+	compare_A_LU(Asaved, matA, size, ld);
+	/* the saved copy is only needed for the comparison */
+	free(Asaved);
+#endif
+}

+ 212 - 0
examples/heat/dw_factolu.h

@@ -0,0 +1,212 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_FACTO_LU_H__
+#define __DW_FACTO_LU_H__
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+/* for USE_CUDA */
+#include <starpu_config.h>
+#ifdef USE_CUDA
+#include <cuda.h>
+#include <cublas.h>
+#endif
+
+#include "../common/blas.h"
+
+#include <starpu.h>
+
+#include "lu_kernels_model.h"
+
+/*
+ * 2*n1*n2*n3 computed in 64 bits; presumably the flop count of a BLAS3
+ * product with those dimensions.
+ * NOTE(review): each cast is written (uint64_t)n1, not (uint64_t)(n1), so
+ * with an expression argument such as a+b only the first operand is cast.
+ */
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+/*
+ * Descriptor attached to each task as both cl_arg and callback_arg.
+ * Which of i, j and k are filled in depends on the kind of task.
+ */
+typedef struct {
+	starpu_data_handle dataA;	/* the partitioned matrix */
+	unsigned i;	/* block / step index (meaning depends on the task kind) */
+	unsigned j;	/* second block index, set for 22 tasks */
+	unsigned k;	/* step or block index (meaning depends on the task kind) */
+	unsigned nblocks;	/* number of blocks per dimension */
+	unsigned *remaining;	/* counter shared by the tasks of one phase (version 1 only) */
+	sem_t *sem;	/* posted once the last diagonal block is done */
+} cl_args;
+
+#ifdef CHECK_RESULTS
+#if 0
+/* Debugging aid, compiled out: print the size x size matrix mat, stored with
+ * leading dimension ld, one row per line. */
+static void __attribute__ ((unused)) dump_matrix(const char *title,
+				const float *mat, unsigned size, unsigned ld)
+{
+	unsigned i,j;
+
+	printf("%s: \n", title);
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			printf("%2.2f\t", mat[j+i*ld]);
+		}
+		printf("\n");
+	}
+}
+#endif
+
+/*
+ * Check a factorization: split the packed result LU (leading dimension ld)
+ * into L (diagonal included) and U (unit diagonal), multiply them with
+ * STRMM and print the largest absolute difference with the original matrix
+ * A (leading dimension ld) on stdout.
+ */
+static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
+				unsigned size, unsigned ld)
+{
+	unsigned i,j;
+	/* element count in size_t: size*size wraps in unsigned from 65536 on */
+	size_t nelems = (size_t)size*size;
+
+	/* zero-filled, densely stored (leading dimension size) factors */
+	float *L = calloc(nelems, sizeof(float));
+	float *U = calloc(nelems, sizeof(float));
+	STARPU_ASSERT(L && U);
+
+	for (j = 0; j < size; j++)
+	{
+		/* entries with i < j go to L */
+		for (i = 0; i < j; i++)
+		{
+			L[j+i*size] = LU[j+i*ld];
+		}
+
+		/* diag i = j */
+		L[j+j*size] = LU[j+j*ld];
+		U[j+j*size] = 1.0f;
+
+		/* entries with i > j go to U */
+		for (i = j+1; i < size; i++)
+		{
+			U[j+i*size] = LU[j+i*ld];
+		}
+	}
+
+#if 0
+	dump_matrix("(LU)", LU, size, ld);
+	dump_matrix("L", L, size, size);
+	dump_matrix("U", U, size, size);
+#endif
+
+	/* L is overwritten with the product L*U */
+	STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+
+	float max_err = 0.0f;
+	for (i = 0; i < size ; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			max_err = STARPU_MAX(max_err, fabsf(  L[j+i*size] - A[j+i*ld]  ));
+		}
+	}
+
+#if 0
+	dump_matrix("A", A, size, ld);
+	dump_matrix("LU", L, size, size);
+#endif
+
+	printf("max error between A and L*U = %f \n", max_err);
+
+	free(L);
+	free(U);
+}
+#endif // CHECK_RESULTS
+
+/* CPU implementations of the four kernels (used as core_func of the codelets) */
+void dw_core_codelet_update_u11(starpu_data_interface_t *, void *);
+void dw_core_codelet_update_u12(starpu_data_interface_t *, void *);
+void dw_core_codelet_update_u21(starpu_data_interface_t *, void *);
+void dw_core_codelet_update_u22(starpu_data_interface_t *, void *);
+
+#ifdef USE_CUDA
+/* CUBLAS implementations of the same kernels (used as cublas_func) */
+void dw_cublas_codelet_update_u11(starpu_data_interface_t *descr, void *_args);
+void dw_cublas_codelet_update_u12(starpu_data_interface_t *descr, void *_args);
+void dw_cublas_codelet_update_u21(starpu_data_interface_t *descr, void *_args);
+void dw_cublas_codelet_update_u22(starpu_data_interface_t *descr, void *_args);
+#endif
+
+/* task completion callbacks of the version 1 scheme; the argument is a cl_args */
+void dw_callback_codelet_update_u11(void *);
+void dw_callback_codelet_update_u12_21(void *);
+void dw_callback_codelet_update_u22(void *);
+
+/* task completion callbacks of the v2 scheme; the argument is a cl_args */
+void dw_callback_v2_codelet_update_u11(void *);
+void dw_callback_v2_codelet_update_u12(void *);
+void dw_callback_v2_codelet_update_u21(void *);
+void dw_callback_v2_codelet_update_u22(void *);
+
+/* performance models referenced by the codelets, defined elsewhere */
+extern struct starpu_perfmodel_t model_11;
+extern struct starpu_perfmodel_t model_12;
+extern struct starpu_perfmodel_t model_21;
+extern struct starpu_perfmodel_t model_22;
+
+#endif // __DW_FACTO_LU_H__

+ 293 - 0
examples/heat/dw_factolu_kernels.c

@@ -0,0 +1,293 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_factolu.h"
+
+/* number of kernel executions through the CPU entry points, per kernel kind;
+ * incremented with STARPU_ATOMIC_ADD by the wrappers below */
+unsigned count_11_core = 0;
+unsigned count_12_core = 0;
+unsigned count_21_core = 0;
+unsigned count_22_core = 0;
+
+/* same counters for the CUBLAS entry points */
+unsigned count_11_cublas = 0;
+unsigned count_12_cublas = 0;
+unsigned count_21_cublas = 0;
+unsigned count_22_cublas = 0;
+
+/* Share of part in total, as a percentage; 0 when total is 0 so that a
+ * kernel kind that never ran is not reported as NaN. */
+static double stat_percentage(unsigned part, unsigned total)
+{
+	return total ? (100.0*part)/total : 0.0;
+}
+
+/* Print one line of statistics for the kernel kind named by label. */
+static void display_stat_line(const char *label, unsigned core, unsigned cublas)
+{
+	unsigned total = core + cublas;
+
+	fprintf(stderr, "%s : core %u (%2.2f) cublas %u (%2.2f)\n", label,
+		core, stat_percentage(core, total),
+		cublas, stat_percentage(cublas, total));
+}
+
+/*
+ * Print on stderr, for each of the four kernel kinds, how many executions
+ * went through the CPU and the CUBLAS entry points and the share of each.
+ */
+void display_stat_heat(void)
+{
+	fprintf(stderr, "STATS : \n");
+	display_stat_line("11", count_11_core, count_11_cublas);
+	display_stat_line("12", count_12_core, count_12_cublas);
+	display_stat_line("21", count_21_core, count_21_cublas);
+	display_stat_line("22", count_22_core, count_22_cublas);
+}
+
+/*
+ *   U22 
+ */
+
+/*
+ * Kernel of the 22 task: a GEMM with alpha = -1 and beta = 1, i.e.
+ * center -= left * right assuming SGEMM follows the standard BLAS argument
+ * order.
+ *
+ * buffers[0], [1] and [2] are the left, right and center blocks.
+ * s selects the implementation: 0 for SGEMM, 1 for cublasSgemm (USE_CUDA
+ * builds only); any other value trips STARPU_ASSERT.
+ *
+ * NOTE(review): ld12 is read from buffers[0] but passed with right
+ * (buffers[1]), and ld21 is read from buffers[1] but passed with left; the
+ * CPU path passes (dy, dx) where the CUDA path passes (dx, dy). Both look
+ * harmless only for square blocks sharing one leading dimension -- confirm.
+ */
+static inline void dw_common_core_codelet_update_u22(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *_args)
+{
+	float *left 	= (float *)buffers[0].blas.ptr;
+	float *right 	= (float *)buffers[1].blas.ptr;
+	float *center 	= (float *)buffers[2].blas.ptr;
+
+	unsigned dx = buffers[2].blas.nx;
+	unsigned dy = buffers[2].blas.ny;
+	unsigned dz = buffers[0].blas.ny;
+
+	unsigned ld12 = buffers[0].blas.ld;
+	unsigned ld21 = buffers[1].blas.ld;
+	unsigned ld22 = buffers[2].blas.ld;
+
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+
+	switch (s) {
+		case 0:
+			SGEMM("N", "N",	dy, dx, dz, 
+				-1.0f, left, ld21, right, ld12,
+					     1.0f, center, ld22);
+			break;
+
+#ifdef USE_CUDA
+		case 1:
+			cublasSgemm('n', 'n', dx, dy, dz, -1.0f, left, ld21,
+					right, ld12, 1.0f, center, ld22);
+			/* any CUBLAS error is fatal */
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_ASSERT(0);
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+/* CPU entry point of the 22 codelet: runs the shared kernel with s = 0, then
+ * increments count_22_core through STARPU_ATOMIC_ADD */
+void dw_core_codelet_update_u22(starpu_data_interface_t *descr, void *_args)
+{
+	dw_common_core_codelet_update_u22(descr, 0, _args);
+	(void)STARPU_ATOMIC_ADD(&count_22_core, 1);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS entry point of the 22 codelet: same kernel with s = 1, counted in
+ * count_22_cublas */
+void dw_cublas_codelet_update_u22(starpu_data_interface_t *descr, void *_args)
+{
+	dw_common_core_codelet_update_u22(descr, 1, _args);
+	(void)STARPU_ATOMIC_ADD(&count_22_cublas, 1);
+}
+#endif// USE_CUDA
+
+/*
+ * U12
+ */
+
+/*
+ * Kernel of the 12 task: a triangular solve ("L", "L", "N", "N", alpha = 1)
+ * with the diagonal block buffers[0] as the triangular matrix, overwriting
+ * buffers[1].
+ *
+ * s selects the implementation: 0 for STRSM, 1 for cublasStrsm (USE_CUDA
+ * builds only); any other value trips STARPU_ASSERT.
+ *
+ * NOTE(review): the CPU path passes (nx12, ny12) as dimensions where the
+ * CUDA path passes (ny12, nx12); equivalent only for square blocks --
+ * confirm.
+ */
+static inline void dw_common_codelet_update_u12(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *_args) {
+	float *sub11;
+	float *sub12;
+
+	sub11 = (float *)buffers[0].blas.ptr;	
+	sub12 = (float *)buffers[1].blas.ptr;
+
+	unsigned ld11 = buffers[0].blas.ld;
+	unsigned ld12 = buffers[1].blas.ld;
+
+	unsigned nx12 = buffers[1].blas.nx;
+	unsigned ny12 = buffers[1].blas.ny;
+	
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+
+	/* solve L11 U12 = A12 (find U12) */
+	switch (s) {
+		case 0:
+			STRSM("L", "L", "N", "N",
+					 nx12, ny12, 1.0f, sub11, ld11, sub12, ld12);
+			break;
+#ifdef USE_CUDA
+		case 1:
+			cublasStrsm('L', 'L', 'N', 'N', ny12, nx12,
+					1.0f, sub11, ld11, sub12, ld12);
+			/* any CUBLAS error is fatal */
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_ASSERT(0);
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+/* CPU entry point of the 12 codelet: runs the shared kernel with s = 0, then
+ * increments count_12_core through STARPU_ATOMIC_ADD */
+void dw_core_codelet_update_u12(starpu_data_interface_t *descr, void *_args)
+{
+	dw_common_codelet_update_u12(descr, 0, _args);
+	(void)STARPU_ATOMIC_ADD(&count_12_core, 1);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS entry point of the 12 codelet: same kernel with s = 1, counted in
+ * count_12_cublas */
+void dw_cublas_codelet_update_u12(starpu_data_interface_t *descr, void *_args)
+{
+	 dw_common_codelet_update_u12(descr, 1, _args);
+	(void)STARPU_ATOMIC_ADD(&count_12_cublas, 1);
+}
+#endif // USE_CUDA
+
+/* 
+ * U21
+ */
+
+/*
+ * Kernel of the 21 task: a triangular solve ("R", "U", "N", "U", alpha = 1)
+ * with the diagonal block buffers[0] as the triangular matrix, overwriting
+ * buffers[1].
+ *
+ * s selects the implementation: 0 for STRSM, 1 for cublasStrsm (USE_CUDA
+ * builds only); any other value trips STARPU_ASSERT.
+ *
+ * NOTE(review): the CPU path passes (nx21, ny21) as dimensions where the
+ * CUDA path passes (ny21, nx21); equivalent only for square blocks --
+ * confirm.
+ */
+static inline void dw_common_codelet_update_u21(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *_args) {
+	float *sub11;
+	float *sub21;
+
+	sub11 = (float *)buffers[0].blas.ptr;
+	sub21 = (float *)buffers[1].blas.ptr;
+
+	unsigned ld11 = buffers[0].blas.ld;
+	unsigned ld21 = buffers[1].blas.ld;
+
+	unsigned nx21 = buffers[1].blas.nx;
+	unsigned ny21 = buffers[1].blas.ny;
+	
+#ifdef USE_CUDA
+	cublasStatus status;
+#endif
+
+	switch (s) {
+		case 0:
+			STRSM("R", "U", "N", "U", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			break;
+#ifdef USE_CUDA
+		case 1:
+			cublasStrsm('R', 'U', 'N', 'U', ny21, nx21, 1.0f, sub11, ld11, sub21, ld21);
+			/* any CUBLAS error is fatal */
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_ASSERT(0);
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+/* CPU entry point of the 21 codelet: runs the shared kernel with s = 0, then
+ * increments count_21_core through STARPU_ATOMIC_ADD */
+void dw_core_codelet_update_u21(starpu_data_interface_t *descr, void *_args)
+{
+	 dw_common_codelet_update_u21(descr, 0, _args);
+	(void)STARPU_ATOMIC_ADD(&count_21_core, 1);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS entry point of the 21 codelet: same kernel with s = 1, counted in
+ * count_21_cublas */
+void dw_cublas_codelet_update_u21(starpu_data_interface_t *descr, void *_args)
+{
+	dw_common_codelet_update_u21(descr, 1, _args);
+	(void)STARPU_ATOMIC_ADD(&count_21_cublas, 1);
+}
+#endif 
+
+/*
+ *	U11
+ */
+
+/* Dump the leading n x n part of tab (leading dimension ld) on stderr, one
+ * row per line, followed by an empty line. */
+static inline void debug_print(float *tab, unsigned ld, unsigned n)
+{
+	unsigned row = 0;
+
+	while (row < n)
+	{
+		unsigned col;
+
+		/* consecutive entries of a row are ld floats apart */
+		for (col = 0; col != n; col++)
+			fprintf(stderr, "%2.2f\t", tab[row + col*ld]);
+
+		fprintf(stderr, "\n");
+		row++;
+	}
+
+	fprintf(stderr, "\n");
+}
+
+/*
+ * Kernel of the 11 task: in-place factorization, without pivoting, of the
+ * nx x nx block descr[0] (leading dimension ld).
+ *
+ * For each z, the pivot sub11[z+z*ld] must be non-zero (asserted); the
+ * nx-z-1 entries starting at sub11[z+(z+1)*ld] with stride ld are divided by
+ * it, then a rank-1 update (alpha = -1) is applied to the trailing
+ * sub-block.
+ *
+ * s selects the implementation: 0 for SSCAL/SGER, 1 for their CUBLAS
+ * counterparts (USE_CUDA builds only); any other value trips STARPU_ASSERT.
+ */
+static inline void dw_common_codelet_update_u11(starpu_data_interface_t *descr, int s, __attribute__((unused)) void *_args) 
+{
+	float *sub11;
+
+	sub11 = (float *)descr[0].blas.ptr; 
+
+	unsigned nx = descr[0].blas.nx;
+	unsigned ld = descr[0].blas.ld;
+
+	unsigned z;
+
+	switch (s) {
+		case 0:
+			for (z = 0; z < nx; z++)
+			{
+				float pivot;
+				pivot = sub11[z+z*ld];
+				STARPU_ASSERT(pivot != 0.0f);
+		
+				SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
+		
+				SGER(nx - z - 1, nx - z - 1, -1.0f,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			break;
+#ifdef USE_CUDA
+		case 1:
+			for (z = 0; z < nx; z++)
+			{
+				float pivot;
+				/* ok that's dirty and ridiculous ... */
+				/* fetches the single pivot value with cublasGetVector;
+				 * presumably sub11 is a device pointer here -- confirm */
+				cublasGetVector(1, sizeof(float), &sub11[z+z*ld], sizeof(float), &pivot, sizeof(float));
+
+				STARPU_ASSERT(pivot != 0.0f);
+				
+				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
+				
+				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
+								&sub11[z+(z+1)*ld], ld,
+								&sub11[(z+1)+z*ld], 1,
+								&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+
+}
+
+
+/* CPU entry point of the 11 codelet: runs the shared kernel with s = 0, then
+ * increments count_11_core through STARPU_ATOMIC_ADD */
+void dw_core_codelet_update_u11(starpu_data_interface_t *descr, void *_args)
+{
+	dw_common_codelet_update_u11(descr, 0, _args);
+	(void)STARPU_ATOMIC_ADD(&count_11_core, 1);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS entry point of the 11 codelet: same kernel with s = 1, counted in
+ * count_11_cublas */
+void dw_cublas_codelet_update_u11(starpu_data_interface_t *descr, void *_args)
+{
+	dw_common_codelet_update_u11(descr, 1, _args);
+	(void)STARPU_ATOMIC_ADD(&count_11_cublas, 1);
+}
+#endif// USE_CUDA

+ 303 - 0
examples/heat/dw_factolu_tag.c

@@ -0,0 +1,303 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_factolu.h"
+
+/*
+ *	Tags identifying the tasks of the factorization. The kind of task is
+ *	stored from bit 60 upwards (1: 11, 2: 12, 3: 21, 4: 22). For TAG11 the
+ *	step k occupies the low bits; for the others k is shifted by 32 bits.
+ *	TAG12/TAG21 keep their block index in the low bits, TAG22 stores i shifted
+ *	by 16 bits and j in the low bits.
+ *	NOTE(review): in TAG22, j would overlap i once j reaches 65536 -- this
+ *	presumably never happens for realistic block counts, confirm.
+ */
+#define TAG11(k)	( (1ULL<<60) | (unsigned long long)(k))
+#define TAG12(k,i)	(((2ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i)))
+#define TAG21(k,j)	(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j)))
+#define TAG22(k,i,j)	(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j)))
+
+/*
+ *	Construct the DAG
+ */
+
+/* Allocate a task that carries no codelet argument and is identified by the
+ * tag 'id'. The caller still has to set the codelet and the buffers. */
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *new_task = starpu_task_create();
+
+	/* the task takes part in tag-based dependencies */
+	new_task->tag_id = id;
+	new_task->use_tag = 1;
+
+	new_task->cl_arg = NULL;
+
+	return new_task;
+}
+
+/* Task callback: argcb points to a semaphore, which is posted to wake up the
+ * thread waiting for the end of the computation. */
+static void terminal_callback(void *argcb)
+{
+	sem_t *sem = argcb;
+	sem_post(sem);
+}
+
+/* Codelet of the "11" tasks: one buffer (the diagonal block), host
+ * implementation always available, CUBLAS one when built with USE_CUDA. */
+static starpu_codelet cl11 = {
+	.where = ANY,
+	.core_func = dw_core_codelet_update_u11,
+#ifdef USE_CUDA
+	.cublas_func = dw_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &model_11
+};
+
+/*
+ * Build the task working on the diagonal block (k,k). The task is returned
+ * and NOT submitted: the caller decides when to launch it.
+ * For k > 0 it waits for the "22" task of step k-1 on the same block; for the
+ * last block (k == nblocks - 1) its callback posts 'sem'.
+ */
+static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k, unsigned nblocks, sem_t *sem)
+{
+	struct starpu_task *t11 = create_task(TAG11(k));
+
+	t11->cl = &cl11;
+
+	/* tasks on the diagonal are always given the highest priority */
+	t11->priority = MAX_PRIO;
+
+	/* single buffer: the diagonal block, updated in place */
+	t11->buffers[0].mode = RW;
+	t11->buffers[0].state = get_sub_data(dataA, 2, k, k);
+
+	/* the last diagonal task notifies the application */
+	if (k == nblocks -1) {
+		t11->callback_arg = sem;
+		t11->callback_func = terminal_callback;
+	}
+
+	/* the first step has no predecessor */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return t11;
+}
+
+/* Codelet of the "12" tasks: two buffers, host implementation always
+ * available, CUBLAS one when built with USE_CUDA. */
+static starpu_codelet cl12 = {
+	.where = ANY,
+	.core_func = dw_core_codelet_update_u12,
+#ifdef USE_CUDA
+	.cublas_func = dw_cublas_codelet_update_u12,
+#endif
+	.nbuffers = 2,
+	.model = &model_12
+};
+
+/*
+ * Create and submit the "12" task of step k for block index i.
+ * It reads the diagonal block (k,k) and updates the sub-data (i,k). It
+ * depends on the "11" task of step k and, for k > 0, on the "22" task of
+ * step k-1 that touched the same sub-data.
+ */
+static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
+{
+	struct starpu_task *t12 = create_task(TAG12(k, i));
+
+	t12->cl = &cl12;
+
+	/* buffer 0: diagonal block (read only), buffer 1: updated block */
+	t12->buffers[0].mode = R;
+	t12->buffers[0].state = get_sub_data(dataA, 2, k, k);
+	t12->buffers[1].mode = RW;
+	t12->buffers[1].state = get_sub_data(dataA, 2, i, k);
+
+	/* the block next to the diagonal gets the highest priority */
+	if (i == k+1)
+		t12->priority = MAX_PRIO;
+
+	/* dependencies must be declared before the submission */
+	if (k == 0) {
+		starpu_tag_declare_deps(TAG12(k, i), 1, TAG11(k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG12(k, i), 2, TAG11(k), TAG22(k-1, i, k));
+	}
+
+	starpu_submit_task(t12);
+}
+
+/* Codelet of the "21" tasks: two buffers, host implementation always
+ * available, CUBLAS one when built with USE_CUDA. */
+static starpu_codelet cl21 = {
+	.where = ANY,
+	.core_func = dw_core_codelet_update_u21,
+#ifdef USE_CUDA
+	.cublas_func = dw_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &model_21
+};
+
+/*
+ * Create and submit the "21" task of step k for block index j.
+ * It reads the diagonal block (k,k) and updates the sub-data (k,j). It
+ * depends on the "11" task of step k and, for k > 0, on the "22" task of
+ * step k-1 that touched the same sub-data.
+ */
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
+{
+	struct starpu_task *t21 = create_task(TAG21(k, j));
+
+	t21->cl = &cl21;
+
+	/* buffer 0: diagonal block (read only), buffer 1: updated block */
+	t21->buffers[0].mode = R;
+	t21->buffers[0].state = get_sub_data(dataA, 2, k, k);
+	t21->buffers[1].mode = RW;
+	t21->buffers[1].state = get_sub_data(dataA, 2, k, j);
+
+	/* the block next to the diagonal gets the highest priority */
+	if (j == k+1)
+		t21->priority = MAX_PRIO;
+
+	/* dependencies must be declared before the submission */
+	if (k == 0) {
+		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+
+	starpu_submit_task(t21);
+}
+
+/* Codelet of the "22" tasks: three buffers, host implementation always
+ * available, CUBLAS one when built with USE_CUDA. */
+static starpu_codelet cl22 = {
+	.where = ANY,
+	.core_func = dw_core_codelet_update_u22,
+#ifdef USE_CUDA
+	.cublas_func = dw_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &model_22
+};
+
+/*
+ * Create and submit the "22" task of step k for the block pair (i,j).
+ * It reads the sub-data (i,k) and (k,j) and updates the sub-data (i,j). It
+ * depends on the "12" task (k,i) and the "21" task (k,j) of the same step
+ * and, for k > 0, on the "22" task of step k-1 for the same (i,j).
+ */
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
+{
+	struct starpu_task *t22 = create_task(TAG22(k, i, j));
+
+	t22->cl = &cl22;
+
+	/* two read-only inputs, one block updated in place */
+	t22->buffers[0].mode = R;
+	t22->buffers[0].state = get_sub_data(dataA, 2, i, k);
+	t22->buffers[1].mode = R;
+	t22->buffers[1].state = get_sub_data(dataA, 2, k, j);
+	t22->buffers[2].mode = RW;
+	t22->buffers[2].state = get_sub_data(dataA, 2, i, j);
+
+	/* the block that becomes the next diagonal block comes first */
+	if ( (i == k + 1) && (j == k +1) )
+		t22->priority = MAX_PRIO;
+
+	/* dependencies must be declared before the submission */
+	if (k == 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG12(k, i), TAG21(k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG12(k, i), TAG21(k, j));
+	}
+
+	starpu_submit_task(t22);
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
+/*
+ * Submit the whole task graph of the factorization of dataA (partitioned in
+ * nblocks x nblocks sub-data), wait for its completion and report the
+ * elapsed time on stdout (in ms) and a GFlops estimate on stderr.
+ * NOTE(review): with nblocks == 0 no task is created, entry_task stays NULL
+ * and nothing ever posts the semaphore -- callers presumably guarantee
+ * nblocks > 0, confirm.
+ */
+static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	/* semaphore posted by the callback of the very last "11" task */
+	sem_t sem;
+	sem_init(&sem, 0, 0U);
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k, nblocks, &sem);
+
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_submit_task(task);
+		}
+		
+		/* "12" and "21" tasks of step k */
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(dataA, k, i);
+			create_task_21(dataA, k, i);
+		}
+
+		/* "22" tasks of step k */
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(dataA, k, i, j);
+			}
+		}
+	}
+
+	/* every other task depends, directly or not, on the first "11" task:
+	 * submitting it starts the computation, so the clock is started here */
+	gettimeofday(&start, NULL);
+	starpu_submit_task(entry_task);
+
+	/* stall the application until the end of computations */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+	gettimeofday(&end, NULL);
+
+	/* elapsed time in microseconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	/* 2n^3/3 floating point operations are accounted for the factorization */
+	unsigned n = starpu_get_blas_nx(dataA);
+	double flop = (2.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+/*
+ * Factorize matA in place using the tag-based task graph.
+ *
+ *	matA    : matrix to factorize, registered with leading dimension ld
+ *	size    : number of rows/columns registered to the runtime
+ *	ld      : leading dimension of matA
+ *	nblocks : number of blocks along each dimension
+ *
+ * When CHECK_RESULTS is defined, a copy of the ld*ld first entries of matA is
+ * taken before the factorization and handed to compare_A_LU() afterwards.
+ */
+void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+
+#ifdef CHECK_RESULTS
+	fprintf(stderr, "Checking results ...\n");
+
+	/* compute the byte count in size_t: ld*ld alone is an unsigned product
+	 * that wraps around as soon as ld reaches 65536 */
+	size_t saved_bytes = (size_t)ld*ld*sizeof(float);
+
+	float *Asaved = malloc(saved_bytes);
+	STARPU_ASSERT(Asaved);
+
+	memcpy(Asaved, matA, saved_bytes);
+#endif
+
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_monitor_blas_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	starpu_filter f;
+		f.filter_func = starpu_vertical_block_filter_func;
+		f.filter_arg = nblocks;
+
+	starpu_filter f2;
+		f2.filter_func = starpu_block_filter_func;
+		f2.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &f, &f2);
+
+	dw_codelet_facto_v3(dataA, nblocks);
+
+	/* gather all the data */
+	starpu_unpartition_data(dataA, 0);
+
+#ifdef CHECK_RESULTS
+	compare_A_LU(Asaved, matA, size, ld);
+
+	/* the saved copy is private to this function: do not leak it */
+	free(Asaved);
+#endif
+}

+ 466 - 0
examples/heat/dw_sparse_cg.c

@@ -0,0 +1,466 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Conjugate gradients for Sparse matrices
+ */
+
+#include "dw_sparse_cg.h"
+
+#ifdef USE_CUDA
+/* CUDA spmv codelet */
+static struct starpu_cuda_module_s cuda_module;
+static struct starpu_cuda_function_s cuda_function;
+static starpu_cuda_codelet_t cuda_codelet;
+
+/*
+ * Load the CUDA module holding the spmv kernel and fill the file-level
+ * cuda_codelet: the "spmv_kernel_3" function, no stack, a grid of
+ * grids x 1 blocks of blocks x 1 threads and 128 bytes of shared memory.
+ */
+void initialize_cuda(void)
+{
+	char module_path[1024];
+
+	/* bounded formatting: STARPUDIR is not under our control, so make sure
+	 * the path fits instead of silently overflowing the buffer */
+	int len = snprintf(module_path, sizeof(module_path),
+		"%s/examples/cuda/spmv_cuda.cubin", STARPUDIR);
+	STARPU_ASSERT(len >= 0 && (size_t)len < sizeof(module_path));
+
+	char *function_symbol = "spmv_kernel_3";
+
+	starpu_init_cuda_module(&cuda_module, module_path);
+	starpu_init_cuda_function(&cuda_function, &cuda_module, function_symbol);
+
+	cuda_codelet.func = &cuda_function;
+	cuda_codelet.stack = NULL;
+	cuda_codelet.stack_size = 0;
+
+	cuda_codelet.gridx = grids;
+	cuda_codelet.gridy = 1;
+
+	cuda_codelet.blockx = blocks;
+	cuda_codelet.blocky = 1;
+
+	cuda_codelet.shmemsize = 128;
+}
+
+
+
+
+#endif // USE_CUDA
+
+/*
+ * Allocate a task bound to tag 'id' together with a private codelet.
+ * The codelet starts zeroed, with where = ANY and no performance model; the
+ * caller fills in the functions, the number of buffers and the buffers.
+ * NOTE(review): the codelet is never released in this file -- presumably an
+ * accepted leak for this example, confirm.
+ */
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	/* zero the whole codelet: callers only set some of its fields (for
+	 * instance cublas_func is left alone without USE_CUDA), the others must
+	 * not hold garbage */
+	starpu_codelet *cl = calloc(1, sizeof(*cl));
+	STARPU_ASSERT(cl);
+
+	cl->where = ANY;
+	cl->model = NULL;
+
+	struct starpu_task *task = starpu_task_create();
+	task->cl = cl;
+	task->cl_arg = NULL;
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+/*
+ * Build the test problem: a tridiagonal matrix of order 'size' (file-level
+ * parameter) stored in CSR format, with 5 on the diagonal and 1 next to it,
+ * and two vectors of 'size' entries.
+ *
+ *	_nzvalA : receives the non-zero values
+ *	_vecb   : receives a vector filled with 2.0f
+ *	_vecx   : receives a vector filled with 0.0f
+ *	_nnz    : receives the number of non-zero entries (3*size - 2)
+ *	_nrow   : receives the number of rows (size)
+ *	_colind : receives the column index of each non-zero entry
+ *	_rowptr : receives the row pointers (size + 1 entries)
+ *
+ * All arrays are allocated with malloc() and owned by the caller.
+ */
+static void create_data(float **_nzvalA, float **_vecb, float **_vecx, uint32_t *_nnz, uint32_t *_nrow, uint32_t **_colind, uint32_t **_rowptr)
+{
+	/* we need a sparse symmetric (positive definite ?) matrix and a "dense" vector */
+
+	/* example of 3-band matrix */
+	float *nzval;
+	uint32_t nnz;
+	uint32_t *colind;
+	uint32_t *rowptr;
+
+	nnz = 3*size-2;
+
+	nzval = malloc(nnz*sizeof(float));
+	colind = malloc(nnz*sizeof(uint32_t));
+	/* one extra entry: the kernels read rowptr[row+1] to find the end of
+	 * each row, including the last one */
+	rowptr = malloc(((size_t)size + 1)*sizeof(uint32_t));
+
+	assert(nzval);
+	assert(colind);
+	assert(rowptr);
+
+
+	/* fill the matrix */
+	unsigned row;
+	unsigned pos = 0;
+	for (row = 0; row < size; row++)
+	{
+		rowptr[row] = pos;
+
+		/* sub-diagonal entry */
+		if (row > 0) {
+			nzval[pos] = 1.0f;
+			colind[pos] = row-1;
+			pos++;
+		}
+
+		/* diagonal entry */
+		nzval[pos] = 5.0f;
+		colind[pos] = row;
+		pos++;
+
+		/* super-diagonal entry */
+		if (row < size - 1) {
+			nzval[pos] = 1.0f;
+			colind[pos] = row+1;
+			pos++;
+		}
+	}
+
+	/* closing sentinel: end of the last row */
+	rowptr[size] = pos;
+
+	*_nnz = nnz;
+	*_nrow = size;
+	*_nzvalA = nzval;
+	*_colind = colind;
+	*_rowptr = rowptr;
+
+	STARPU_ASSERT(pos == nnz);
+
+	/* initiate the 2 vectors */
+	float *invec, *outvec;
+	invec = malloc(size*sizeof(float));
+	assert(invec);
+
+	outvec = malloc(size*sizeof(float));
+	assert(outvec);
+
+	/* fill those */
+	unsigned ind;
+	for (ind = 0; ind < size; ind++)
+	{
+		invec[ind] = 2.0f;
+		outvec[ind] = 0.0f;
+	}
+
+	*_vecb = invec;
+	*_vecx = outvec;
+}
+
+/* Generate the test matrix and vectors with create_data() and run the
+ * conjugate gradient on them. The generated arrays are not released here. */
+void init_problem(void)
+{
+	/* create the sparse input matrix */
+	float *nzval;
+	float *vecb;
+	float *vecx;
+	uint32_t nnz;
+	uint32_t nrow;
+	uint32_t *colind;
+	uint32_t *rowptr;
+
+	create_data(&nzval, &vecb, &vecx, &nnz, &nrow, &colind, &rowptr);
+
+	conjugate_gradient(nzval, vecb, vecx, nnz, nrow, colind, rowptr);
+}
+
+/*
+ *	cg initialization phase 
+ */
+
+/*
+ * Submit the three initialization tasks (tags 1, 2 and 3), chained through
+ * tag dependencies 1 -> 2 -> 3. The callback of the last one, iteration_cg,
+ * decides whether iterations must be launched.
+ */
+void init_cg(struct cg_problem *problem) 
+{
+	problem->i = 0;
+
+	/* r = b  - A x */
+	struct starpu_task *task1 = create_task(1UL);
+	task1->cl->where = CORE;
+	task1->cl->core_func = core_codelet_func_1;
+	task1->cl->nbuffers = 4;
+		task1->buffers[0].state = problem->ds_matrixA;
+		task1->buffers[0].mode = R;
+		task1->buffers[1].state = problem->ds_vecx;
+		task1->buffers[1].mode = R;
+		task1->buffers[2].state = problem->ds_vecr;
+		task1->buffers[2].mode = W;
+		task1->buffers[3].state = problem->ds_vecb;
+		task1->buffers[3].mode = R;
+
+	/* d = r */
+	struct starpu_task *task2 = create_task(2UL);
+	task2->cl->where = CORE;
+	task2->cl->core_func = core_codelet_func_2;
+	task2->cl->nbuffers = 2;
+		task2->buffers[0].state = problem->ds_vecd;
+		task2->buffers[0].mode = W;
+		task2->buffers[1].state = problem->ds_vecr;
+		task2->buffers[1].mode = R;
+	
+	starpu_tag_declare_deps(2UL, 1, 1UL);
+
+	/* delta_new = trans(r) r */
+	struct starpu_task *task3 = create_task(3UL);
+	/* NOTE(review): CUBLAS is requested even when cublas_func is not set
+	 * (build without USE_CUDA) -- presumably harmless, confirm */
+	task3->cl->where = CUBLAS|CORE;
+#ifdef USE_CUDA
+	task3->cl->cublas_func = cublas_codelet_func_3;
+#endif
+	task3->cl->core_func = core_codelet_func_3;
+	task3->cl_arg = problem;
+	task3->cl->nbuffers = 1;
+		task3->buffers[0].state = problem->ds_vecr;
+		task3->buffers[0].mode = R;
+
+	/* once delta_new is known, decide whether to iterate */
+	task3->callback_func = iteration_cg;
+	task3->callback_arg = problem;
+	
+	/* XXX 3 should only depend on 1 ... */
+	starpu_tag_declare_deps(3UL, 1, 2UL);
+
+	/* launch the computation now */
+	starpu_submit_task(task1);
+	starpu_submit_task(task2);
+	starpu_submit_task(task3);
+}
+
+/*
+ *	the inner iteration of the cg algorithm 
+ *		the codelet code launcher is its own callback !
+ */
+
+/*
+ * Submit the six tasks of one iteration (steps 4 to 9), chained one after
+ * the other through tag dependencies. Tags are made unique per iteration by
+ * or-ing the step number with problem->i * 1024. The callback of the last
+ * task, iteration_cg, decides whether another iteration is needed.
+ */
+void launch_new_cg_iteration(struct cg_problem *problem)
+{
+	unsigned iter = problem->i;
+
+	/* NOTE(review): the product is computed on an unsigned before being
+	 * widened; it stays in range as long as iter is bounded by MAXITER */
+	unsigned long long maskiter = (iter*1024);
+
+	/* q = A d */
+	struct starpu_task *task4 = create_task(maskiter | 4UL);
+	task4->cl->where = CORE;
+	task4->cl->core_func = core_codelet_func_4;
+	task4->cl->nbuffers = 3;
+		task4->buffers[0].state = problem->ds_matrixA;
+		task4->buffers[0].mode = R;
+		task4->buffers[1].state = problem->ds_vecd;
+		task4->buffers[1].mode = R;
+		task4->buffers[2].state = problem->ds_vecq;
+		task4->buffers[2].mode = W;
+
+	/* alpha = delta_new / ( trans(d) q )*/
+	struct starpu_task *task5 = create_task(maskiter | 5UL);
+	task5->cl->where = CUBLAS|CORE;
+#ifdef USE_CUDA
+	task5->cl->cublas_func = cublas_codelet_func_5;
+#endif
+	task5->cl->core_func = core_codelet_func_5;
+	task5->cl_arg = problem;
+	task5->cl->nbuffers = 2;
+		task5->buffers[0].state = problem->ds_vecd;
+		task5->buffers[0].mode = R;
+		task5->buffers[1].state = problem->ds_vecq;
+		task5->buffers[1].mode = R;
+
+	starpu_tag_declare_deps(maskiter | 5UL, 1, maskiter | 4UL);
+
+	/* x = x + alpha d */
+	struct starpu_task *task6 = create_task(maskiter | 6UL);
+	task6->cl->where = CUBLAS|CORE;
+#ifdef USE_CUDA
+	task6->cl->cublas_func = cublas_codelet_func_6;
+#endif
+	task6->cl->core_func = core_codelet_func_6;
+	task6->cl_arg = problem;
+	task6->cl->nbuffers = 2;
+		task6->buffers[0].state = problem->ds_vecx;
+		task6->buffers[0].mode = RW;
+		task6->buffers[1].state = problem->ds_vecd;
+		task6->buffers[1].mode = R;
+
+	starpu_tag_declare_deps(maskiter | 6UL, 1, maskiter | 5UL);
+
+	/* r = r - alpha q */
+	struct starpu_task *task7 = create_task(maskiter | 7UL);
+	task7->cl->where = CUBLAS|CORE;
+#ifdef USE_CUDA
+	task7->cl->cublas_func = cublas_codelet_func_7;
+#endif
+	task7->cl->core_func = core_codelet_func_7;
+	task7->cl_arg = problem;
+	task7->cl->nbuffers = 2;
+		task7->buffers[0].state = problem->ds_vecr;
+		task7->buffers[0].mode = RW;
+		task7->buffers[1].state = problem->ds_vecq;
+		task7->buffers[1].mode = R;
+
+	starpu_tag_declare_deps(maskiter | 7UL, 1, maskiter | 6UL);
+
+	/* update delta_* and compute beta */
+	struct starpu_task *task8 = create_task(maskiter | 8UL);
+	task8->cl->where = CUBLAS|CORE;
+#ifdef USE_CUDA
+	task8->cl->cublas_func = cublas_codelet_func_8;
+#endif
+	task8->cl->core_func = core_codelet_func_8;
+	task8->cl_arg = problem;
+	task8->cl->nbuffers = 1;
+		task8->buffers[0].state = problem->ds_vecr;
+		task8->buffers[0].mode = R;
+
+	starpu_tag_declare_deps(maskiter | 8UL, 1, maskiter | 7UL);
+
+	/* d = r + beta d */
+	struct starpu_task *task9 = create_task(maskiter | 9UL);
+	task9->cl->where = CUBLAS|CORE;
+#ifdef USE_CUDA
+	task9->cl->cublas_func = cublas_codelet_func_9;
+#endif
+	task9->cl->core_func = core_codelet_func_9;
+	task9->cl_arg = problem;
+	task9->cl->nbuffers = 2;
+		task9->buffers[0].state = problem->ds_vecd;
+		task9->buffers[0].mode = RW;
+		task9->buffers[1].state = problem->ds_vecr;
+		task9->buffers[1].mode = R;
+
+	starpu_tag_declare_deps(maskiter | 9UL, 1, maskiter | 8UL);
+
+	/* the end of the iteration triggers the convergence test */
+	task9->callback_func = iteration_cg;
+	task9->callback_arg = problem;
+	
+	/* launch the computation now */
+	starpu_submit_task(task4);
+	starpu_submit_task(task5);
+	starpu_submit_task(task6);
+	starpu_submit_task(task7);
+	starpu_submit_task(task8);
+	starpu_submit_task(task9);
+}
+
+/*
+ * Callback run at the end of the initialization and of every iteration.
+ * 'problem' is the struct cg_problem. While fewer than MAXITER iterations
+ * were done and delta_new is still above epsilon, the iteration counter is
+ * incremented and a new iteration is launched; otherwise the semaphore of
+ * the problem is posted.
+ */
+void iteration_cg(void *problem)
+{
+	struct cg_problem *pb = problem;
+
+	printf("i : %d (MAX %d)\n\tdelta_new %f (%f)\n", pb->i, MAXITER, pb->delta_new, sqrt(pb->delta_new / pb->size));
+
+	/* stop condition: iteration budget exhausted or residual small enough */
+	if (!(pb->i < MAXITER) || !(pb->delta_new > pb->epsilon))
+	{
+		printf("We are done ... after %d iterations \n", pb->i - 1);
+		printf("i : %d\n\tdelta_new %2.5f\n", pb->i, pb->delta_new);
+		sem_post(pb->sem);
+		return;
+	}
+
+	/* periodic progress report */
+	if (pb->i % 1000 == 0)
+		printf("i : %d\n\tdelta_new %f (%f)\n", pb->i, pb->delta_new, sqrt(pb->delta_new / pb->size));
+
+	pb->i++;
+
+	/* not converged yet: go for another round */
+	launch_new_cg_iteration(problem);
+}
+
+/*
+ *	initializing the problem 
+ */
+
+/*
+ * Solve A x = b with the conjugate gradient method.
+ *
+ *	nzvalA, colind, rowptr : CSR description of A (nnz non-zero entries)
+ *	vecb                   : right-hand side, nrow entries
+ *	vecx                   : initial guess on entry, nrow entries; its first
+ *	                         entries are printed once the iterations stop
+ *	nrow                   : order of the system
+ *
+ * The function registers all the data to the runtime, allocates the three
+ * work vectors r, d and q, submits the initialization tasks and blocks until
+ * iteration_cg() posts the semaphore.
+ * NOTE(review): the work vectors stay allocated and registered on return --
+ * presumably acceptable for this example, confirm before reusing the code.
+ */
+void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
+			unsigned nrow, uint32_t *colind, uint32_t *rowptr)
+{
+	/* first declare all the data structures to the runtime */
+
+	starpu_data_handle ds_matrixA;
+	starpu_data_handle ds_vecx, ds_vecb;
+	starpu_data_handle ds_vecr, ds_vecd, ds_vecq;
+
+	/* first the user-allocated data */
+	starpu_monitor_csr_data(&ds_matrixA, 0, nnz, nrow,
+			(uintptr_t)nzvalA, colind, rowptr, 0, sizeof(float));
+	starpu_monitor_vector_data(&ds_vecx, 0, (uintptr_t)vecx, nrow, sizeof(float));
+	starpu_monitor_vector_data(&ds_vecb, 0, (uintptr_t)vecb, nrow, sizeof(float));
+
+	/* then allocate the data internal to the algorithm */
+	float *ptr_vecr, *ptr_vecd, *ptr_vecq;
+
+	unsigned i;
+	ptr_vecr = malloc(nrow*sizeof(float));
+	ptr_vecd = malloc(nrow*sizeof(float));
+	ptr_vecq = malloc(nrow*sizeof(float));
+
+	/* these vectors can be large: fail loudly rather than write through a
+	 * NULL pointer just below */
+	STARPU_ASSERT(ptr_vecr);
+	STARPU_ASSERT(ptr_vecd);
+	STARPU_ASSERT(ptr_vecq);
+
+	for (i = 0; i < nrow; i++)
+	{
+		ptr_vecr[i] = 0.0f;
+		ptr_vecd[i] = 0.0f;
+		ptr_vecq[i] = 0.0f;
+	}
+
+	/* nrow is unsigned: print it as such */
+	printf("nrow = %u \n", nrow);
+
+	/* and declare them as well */
+	starpu_monitor_vector_data(&ds_vecr, 0, (uintptr_t)ptr_vecr, nrow, sizeof(float));
+	starpu_monitor_vector_data(&ds_vecd, 0, (uintptr_t)ptr_vecd, nrow, sizeof(float));
+	starpu_monitor_vector_data(&ds_vecq, 0, (uintptr_t)ptr_vecq, nrow, sizeof(float));
+
+	/* we now have the complete problem */
+	struct cg_problem problem;
+
+	problem.ds_matrixA = ds_matrixA;
+	problem.ds_vecx    = ds_vecx;
+	problem.ds_vecb    = ds_vecb;
+	problem.ds_vecr    = ds_vecr;
+	problem.ds_vecd    = ds_vecd;
+	problem.ds_vecq    = ds_vecq;
+
+	problem.epsilon = EPSILON;
+	problem.size = nrow;
+	problem.delta_old = 1.0;
+	problem.delta_new = 1.0; /* just to make sure we do at least one iteration */
+
+	/* we need a semaphore to synchronize with callbacks */
+	sem_t sem;
+	sem_init(&sem, 0, 0U);
+	problem.sem  = &sem;
+
+	init_cg(&problem);
+
+	/* 'problem' lives on this stack frame: do not return before the last
+	 * callback has posted the semaphore */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	print_results(vecx, nrow);
+}
+
+
+/* Entry point for external callers: initialize the runtime (and the CUDA
+ * codelet when built with USE_CUDA), then run conjugate_gradient() with the
+ * very same arguments. */
+void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
+			unsigned nrow, uint32_t *colind, uint32_t *rowptr)
+{
+	/* start the runtime */
+	starpu_init();
+
+
+#ifdef USE_CUDA
+	initialize_cuda();
+#endif
+
+	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
+}
+
+#if 0
+/* Stand-alone driver, currently compiled out by the surrounding #if 0.
+ * NOTE(review): argc/argv are marked unused although they are passed to
+ * parse_args(), and 'start'/'end' are not declared in this file -- this
+ * would need fixing before the code is enabled again. */
+int main(__attribute__ ((unused)) int argc,
+	__attribute__ ((unused)) char **argv)
+{
+	parse_args(argc, argv);
+
+	timing_init();
+
+	/* start the runtime */
+	starpu_init();
+
+
+#ifdef USE_CUDA
+	initialize_cuda();
+#endif
+
+	init_problem();
+
+	double timing = timing_delay(&start, &end);
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+
+	return 0;
+}
+#endif

+ 136 - 0
examples/heat/dw_sparse_cg.h

@@ -0,0 +1,136 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_SPARSE_CG_H__
+#define __DW_SPARSE_CG_H__
+
+#include <stdio.h>
+#include <stdint.h>
+#include <semaphore.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <starpu_config.h>
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cublas.h>
+#endif
+
+#include "../common/blas.h"
+
+#define MAXITER	100000
+#define EPSILON	0.0000001f
+
+/* code parameters, overridable with parse_args(). Being static in a header,
+ * each file including it gets its own private copy of these variables. */
+static uint32_t size = 33554432;	/* set by -size */
+static unsigned usecpu = 0;		/* set to 1 by -cpu */
+static unsigned blocks = 512;		/* set by -block */
+static unsigned grids  = 8;		/* set by -grid */
+
+/* State of one conjugate gradient resolution, shared by all the tasks and
+ * callbacks through their argument. */
+struct cg_problem {
+	/* handles of the data registered to the runtime */
+	starpu_data_handle ds_matrixA;
+	starpu_data_handle ds_vecx;
+	starpu_data_handle ds_vecb;
+	starpu_data_handle ds_vecr;
+	starpu_data_handle ds_vecd;
+	starpu_data_handle ds_vecq;
+
+	/* posted by iteration_cg() when the iterations stop */
+	sem_t *sem;
+	
+	/* scalars of the algorithm, updated by the codelets */
+	float alpha;
+	float beta;
+	float delta_0;
+	float delta_old;
+	float delta_new;
+	/* iterations go on while delta_new is above this threshold */
+	float epsilon;
+
+	/* iteration counter */
+	int i;
+	/* number of entries of the vectors */
+	unsigned size;
+};
+
+/* some useful functions */
+/*
+ * Parse the command line and update the file-level parameters:
+ *	-size <n>  : order of the problem
+ *	-block <n> : value stored in 'blocks'
+ *	-grid <n>  : value stored in 'grids'
+ *	-cpu       : sets 'usecpu'
+ * Unknown arguments are ignored. An option whose value is missing is
+ * reported on stderr and ends the parsing; the value of an option is never
+ * interpreted as another option.
+ */
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		const char *opt = argv[i];
+
+		if (strcmp(opt, "-cpu") == 0) {
+			usecpu = 1;
+			continue;
+		}
+
+		int is_size  = (strcmp(opt, "-size") == 0);
+		int is_block = (strcmp(opt, "-block") == 0);
+		int is_grid  = (strcmp(opt, "-grid") == 0);
+
+		if (!is_size && !is_block && !is_grid)
+			continue;
+
+		/* argv[argc] is NULL: never hand it to strtol() */
+		if (i + 1 >= argc) {
+			fprintf(stderr, "missing value after %s\n", opt);
+			break;
+		}
+
+		long value = strtol(argv[++i], NULL, 10);
+
+		if (is_size)
+			size = value;
+		else if (is_block)
+			blocks = value;
+		else
+			grids = value;
+	}
+}
+
+
+/* Print a banner followed by the first entries of 'result' on stdout, at
+ * most 16 of them and never more than 'size'. */
+static void __attribute__ ((unused)) print_results(float *result, unsigned size)
+{
+	unsigned shown = STARPU_MIN(size, 16);
+	unsigned idx = 0;
+
+	printf("**** RESULTS **** \n");
+
+	while (idx < shown)
+	{
+		printf("%d -> %f\n", idx, result[idx]);
+		idx++;
+	}
+}
+
+void core_codelet_func_1(starpu_data_interface_t *descr, void *arg);
+
+void core_codelet_func_2(starpu_data_interface_t *descr, void *arg);
+
+void cublas_codelet_func_3(starpu_data_interface_t *descr, void *arg);
+void core_codelet_func_3(starpu_data_interface_t *descr, void *arg);
+
+void core_codelet_func_4(starpu_data_interface_t *descr, void *arg);
+
+void core_codelet_func_5(starpu_data_interface_t *descr, void *arg);
+void cublas_codelet_func_5(starpu_data_interface_t *descr, void *arg);
+
+void cublas_codelet_func_6(starpu_data_interface_t *descr, void *arg);
+void core_codelet_func_6(starpu_data_interface_t *descr, void *arg);
+
+void cublas_codelet_func_7(starpu_data_interface_t *descr, void *arg);
+void core_codelet_func_7(starpu_data_interface_t *descr, void *arg);
+
+void cublas_codelet_func_8(starpu_data_interface_t *descr, void *arg);
+void core_codelet_func_8(starpu_data_interface_t *descr, void *arg);
+
+void cublas_codelet_func_9(starpu_data_interface_t *descr, void *arg);
+void core_codelet_func_9(starpu_data_interface_t *descr, void *arg);
+
+void iteration_cg(void *problem);
+
+void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
+			unsigned nrow, uint32_t *colind, uint32_t *rowptr);
+
+#endif // __DW_SPARSE_CG_H__

+ 423 - 0
examples/heat/dw_sparse_cg_kernels.c

@@ -0,0 +1,423 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_sparse_cg.h"
+
+/*
+ *	Algorithm :
+ *		
+ *		i = 0
+ *		r = b - A x
+ *			( d = A x ; r = r - d )
+ *		d = r
+ *		delta_new = trans(r) r
+ *		delta_0 = delta_new
+ *
+ * 		while (i < i_max && delta_new > eps^2 delta_0) 
+ * 		{
+ *			q = A d
+ *			alpha = delta_new / ( trans(d) q )
+ *			x = x + alpha d
+ *			if ( i is divisible by 50 )
+ *				r = b - A x
+ *			else 
+ *				r = r - alpha q
+ *			delta_old = delta_new
+ *			delta_new = trans(r) r
+ *			beta = delta_new / delta_old
+ *			d = r + beta d
+ *			i = i + 1
+ * 		}
+ */
+
+
+/*
+ *	compute r = b - A x
+ *
+ *		descr[0] = A, descr[1] = x, descr [2] = r, descr[3] = b
+ */
+
+void core_codelet_func_1(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	/* CSR description of A */
+	float *nzval = (float *)descr[0].csr.nzval;
+	uint32_t *colind = descr[0].csr.colind;
+	uint32_t *rowptr = descr[0].csr.rowptr;
+
+	/* row pointers are shifted by firstentry before indexing nzval/colind */
+	uint32_t firstentry = descr[0].csr.firstentry;
+	uint32_t nrow = descr[0].csr.nrow;
+
+	float *vecx = (float *)descr[1].vector.ptr;
+	float *vecr = (float *)descr[2].vector.ptr;
+	float *vecb = (float *)descr[3].vector.ptr;
+
+	unsigned row;
+	for (row = 0; row < nrow; row++)
+	{
+		float tmp = 0.0f;
+		unsigned index;
+
+		/* rowptr[row+1] is read for the last row too: the array must hold
+		 * nrow + 1 entries */
+		unsigned firstindex = rowptr[row] - firstentry;
+		unsigned lastindex = rowptr[row+1] - firstentry;
+
+		/* tmp = (A x)[row] */
+		for (index = firstindex; index < lastindex; index++)
+		{
+			unsigned col;
+
+			col = colind[index];
+			tmp += nzval[index]*vecx[col];
+		}
+
+		vecr[row] = vecb[row] - tmp;
+	}
+}
+
+/*
+ *	compute d = r
+ *		descr[0] = d, descr[1] = r
+ */
+void core_codelet_func_2(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	/* both vectors must have the same geometry */
+	STARPU_ASSERT(descr[0].vector.nx == descr[1].vector.nx);
+	STARPU_ASSERT(descr[0].vector.elemsize == descr[1].vector.elemsize);
+
+	uint32_t count = descr[0].vector.nx;
+	size_t width = descr[0].vector.elemsize;
+
+	float *to = (float *)descr[0].vector.ptr;
+	float *from = (float *)descr[1].vector.ptr;
+
+	/* d = r is a plain copy of count elements of width bytes */
+	memcpy(to, from, count*width);
+}
+
+/*
+ *	compute delta_new = trans(r) r
+ *		delta_0   = delta_new
+ *
+ *		arg = the struct cg_problem; its delta_new and delta_0 are written
+ */
+
+void core_codelet_func_3(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is the vector r */
+	float *r = (float *)descr[0].vector.ptr;
+	int n = (int)descr[0].vector.nx;
+
+	float rr = SDOT(n, r, 1, r, 1);
+
+	fprintf(stderr, "func 3 : DOT = %f\n", rr);
+
+	/* the same value is recorded as delta_0 and delta_new */
+	pb->delta_0 = rr;
+	pb->delta_new = rr;
+}
+
+#ifdef USE_CUDA
+void cublas_codelet_func_3(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is the vector r */
+	float *r = (float *)descr[0].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	float rr = cublasSdot (n, r, 1, r, 1);
+
+	/* the same value is recorded as delta_0 and delta_new */
+	pb->delta_0 = rr;
+	pb->delta_new = rr;
+}
+#endif
+
+
+/*
+ *	compute q with : q = A d
+ *
+ *		descr[0] = A, descr[1] = d, descr [2] = q
+ */
+
+void core_codelet_func_4(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	/* CSR description of A */
+	float *nzval = (float *)descr[0].csr.nzval;
+	uint32_t *colind = descr[0].csr.colind;
+	uint32_t *rowptr = descr[0].csr.rowptr;
+
+	/* row pointers are shifted by firstentry before indexing nzval/colind */
+	uint32_t firstentry = descr[0].csr.firstentry;
+	uint32_t nrow = descr[0].csr.nrow;
+
+	float *vecd = (float *)descr[1].vector.ptr;
+	float *vecq = (float *)descr[2].vector.ptr;
+
+	unsigned row;
+	for (row = 0; row < nrow; row++)
+	{
+		float tmp = 0.0f;
+		unsigned index;
+
+		/* rowptr[row+1] is read for the last row too: the array must hold
+		 * nrow + 1 entries */
+		unsigned firstindex = rowptr[row] - firstentry;
+		unsigned lastindex = rowptr[row+1] - firstentry;
+
+		/* tmp = (A d)[row] */
+		for (index = firstindex; index < lastindex; index++)
+		{
+			unsigned col;
+
+			col = colind[index];
+			tmp += nzval[index]*vecd[col];
+		}
+
+		vecq[row] = tmp;
+	}
+
+}
+
+/* 
+ *	compute alpha = delta_new / ( trans(d) q )
+ *
+ * 		descr[0] = d, descr[1] = q
+ *		arg = the struct cg_problem; delta_new is read, alpha is written
+ */
+
+void core_codelet_func_5(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* the two vectors must have the same length */
+	STARPU_ASSERT(descr[1].vector.nx == descr[0].vector.nx);
+
+	float *d = (float *)descr[0].vector.ptr;
+	float *q = (float *)descr[1].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	/* trans(d) q */
+	float dq = SDOT(n, d, 1, q, 1);
+
+	pb->alpha = pb->delta_new / dq;
+}
+
+#ifdef USE_CUDA
+void cublas_codelet_func_5(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* the two vectors must have the same length */
+	STARPU_ASSERT(descr[1].vector.nx == descr[0].vector.nx);
+
+	float *d = (float *)descr[0].vector.ptr;
+	float *q = (float *)descr[1].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	/* trans(d) q */
+	float dq = cublasSdot (n, d, 1, q, 1);
+
+	pb->alpha = pb->delta_new / dq;
+}
+#endif
+
+
+
+/*
+ *	compute x = x + alpha d
+ *
+ * 		descr[0] : x, descr[1] : d
+ *		arg = the struct cg_problem; alpha is read
+ */
+
+void core_codelet_func_6(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is x (updated), descr[1] is d */
+	float *x = (float *)descr[0].vector.ptr;
+	float *d = (float *)descr[1].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	/* x += alpha d */
+	SAXPY(n, pb->alpha, d, 1, x, 1);
+}
+
+#ifdef USE_CUDA
+void cublas_codelet_func_6(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is x (updated), descr[1] is d */
+	float *x = (float *)descr[0].vector.ptr;
+	float *d = (float *)descr[1].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	/* x += alpha d */
+	cublasSaxpy (n, pb->alpha, d, 1, x, 1);
+}
+#endif
+
+/*
+ *	compute r = r - alpha q
+ *
+ * 		descr[0] : r, descr[1] : q
+ *		arg = the struct cg_problem; alpha is read
+ */
+
+void core_codelet_func_7(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is r (updated), descr[1] is q */
+	float *r = (float *)descr[0].vector.ptr;
+	float *q = (float *)descr[1].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	/* r -= alpha q */
+	SAXPY(n, -pb->alpha, q, 1, r, 1);
+}
+
+#ifdef USE_CUDA
+void cublas_codelet_func_7(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is r (updated), descr[1] is q */
+	float *r = (float *)descr[0].vector.ptr;
+	float *q = (float *)descr[1].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	/* r -= alpha q */
+	cublasSaxpy (n, -pb->alpha, q, 1, r, 1);
+}
+#endif
+
+/*
+ *	compute delta_old = delta_new
+ *		delta_new = trans(r) r
+ *		beta = delta_new / delta_old
+ *
+ * 		descr[0] = r
+ *		arg = the struct cg_problem; delta_old, delta_new and beta are written
+ */
+
+void core_codelet_func_8(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is the vector r */
+	float *r = (float *)descr[0].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	float rr = SDOT(n, r, 1, r, 1);
+
+	/* shift the deltas, then derive beta from the two of them */
+	pb->delta_old = pb->delta_new;
+	pb->delta_new = rr;
+	pb->beta = pb->delta_new/pb->delta_old;
+}
+
+#ifdef USE_CUDA
+void cublas_codelet_func_8(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is the vector r */
+	float *r = (float *)descr[0].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	float rr = cublasSdot (n, r, 1, r, 1);
+
+	/* shift the deltas, then derive beta from the two of them */
+	pb->delta_old = pb->delta_new;
+	pb->delta_new = rr;
+	pb->beta = pb->delta_new/pb->delta_old;
+}
+#endif
+
+/*
+ *	compute d = r + beta d
+ *
+ * 		descr[0] : d, descr[1] : r
+ *		arg = the struct cg_problem; beta is read
+ *
+ */
+
+void core_codelet_func_9(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is d (updated), descr[1] is r */
+	float *d = (float *)descr[0].vector.ptr;
+	float *r = (float *)descr[1].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	/* two steps: first d = beta d ... */
+	SSCAL(n, pb->beta, d, 1);
+
+	/* ... then d = r + d */
+	SAXPY (n, 1.0f, r, 1, d, 1);
+}
+
+#ifdef USE_CUDA
+void cublas_codelet_func_9(starpu_data_interface_t *descr, void *arg)
+{
+	struct cg_problem *pb = arg;
+
+	/* descr[0] is d (updated), descr[1] is r */
+	float *d = (float *)descr[0].vector.ptr;
+	float *r = (float *)descr[1].vector.ptr;
+	uint32_t n = descr[0].vector.nx;
+
+	/* two steps: first d = beta d ... */
+	cublasSscal(n, pb->beta, d, 1);
+
+	/* ... then d = r + d */
+	cublasSaxpy (n, 1.0f, r, 1, d, 1);
+}
+#endif

+ 766 - 0
examples/heat/heat.c

@@ -0,0 +1,766 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "heat.h"
+
+/* default values, overridden by the command line options handled in parse_args */
+static unsigned ntheta = 32+2;
+static unsigned nthick = 32+2;
+static unsigned nblocks = 16;
+static unsigned shape = 0;
+static unsigned pinned = 0;
+static unsigned version = 2;
+static unsigned use_cg = 0; /* use a LU decomposition or CG ? */
+
+/* copies of the arguments received by main */
+static int argc_;
+static char **argv_;
+
+/* conjugate gradient solver, defined outside this file; called from main when -cg is given */
+extern void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
+              		unsigned nrow, uint32_t *colind, uint32_t *rowptr);
+
+
+/*
+ * Reads the numeric value that follows the option found at argv[*pos] and
+ * moves *pos onto that value. When the option is the last argument there is
+ * nothing to read: an error is printed and the program exits.
+ */
+static unsigned parse_option_value(int argc, char **argv, int *pos)
+{
+	if (*pos + 1 >= argc) {
+		fprintf(stderr, "option %s expects a value\n", argv[*pos]);
+		exit(EXIT_FAILURE);
+	}
+
+	(*pos)++;
+
+	/* same conversion as before: a non numeric value silently yields 0 */
+	return strtol(argv[*pos], NULL, 10);
+}
+
+/*
+ * Updates the file-level settings (use_cg, shape, nthick, ntheta, nblocks,
+ * version, pinned) from the command line. Unknown arguments are ignored.
+ *
+ * The tests are deliberately independent "if"s rather than an "else if"
+ * chain: once an option has consumed its value, argv[i] designates that value
+ * and the remaining tests are applied to it, as they always were.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-cg") == 0) {
+			use_cg = 1;
+		}
+
+		if (strcmp(argv[i], "-shape") == 0) {
+			shape = parse_option_value(argc, argv, &i);
+		}
+
+		if (strcmp(argv[i], "-nthick") == 0) {
+			nthick = parse_option_value(argc, argv, &i);
+		}
+
+		if (strcmp(argv[i], "-ntheta") == 0) {
+			ntheta = parse_option_value(argc, argv, &i);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			nblocks = parse_option_value(argc, argv, &i);
+		}
+
+		if (strcmp(argv[i], "-v1") == 0) {
+			version = 1;
+		}
+
+		if (strcmp(argv[i], "-v2") == 0) {
+			version = 2;
+		}
+
+		if (strcmp(argv[i], "-v3") == 0) {
+			version = 3;
+		}
+
+		if (strcmp(argv[i], "-pin") == 0) {
+			pinned = 1;
+		}
+
+		/* note: the usage line does not mention -nblocks, which is accepted above */
+		if (strcmp(argv[i], "-h") == 0) {
+			printf("usage : %s [-v1|-v2|-v3] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg]\n", argv[0]);
+		}
+	}
+}
+
+
+
+/*
+ * The Finite element method code 
+ *
+ *   B              C
+ *	**********
+ *	*  0   * *
+ *	*    *   *
+ *	*  *   1 *
+ *	**********
+ *   A             D
+ */
+
+/*
+ * Derivative along X or Y (selected by xy) of the function psi attached to
+ * node (theta_psi, thick_psi), on one triangle of the mesh cell whose corner A
+ * is node (theta_tr, thick_tr).
+ *
+ * side_tr selects the triangle of the cell: non-zero uses the corners
+ * (theta_tr, thick_tr), (theta_tr+1, thick_tr) and (theta_tr+1, thick_tr+1),
+ * zero uses (theta_tr, thick_tr), (theta_tr, thick_tr+1) and
+ * (theta_tr+1, thick_tr+1).
+ *
+ * Returns 0 when the psi node is not a corner of that triangle.
+ */
+static inline float diff_psi(unsigned theta_tr, unsigned thick_tr, unsigned side_tr,
+		 unsigned theta_psi, unsigned thick_psi, unsigned xy, point *pmesh)
+{
+	float xa,ya,xb,yb,xc,yc;
+	float tmp;
+
+	assert(theta_tr + 2 <= ntheta);
+	assert(thick_tr + 2 <= nthick);
+
+	/* A */
+	xa = pmesh[NODE_NUMBER(theta_tr, thick_tr)].x;
+	ya = pmesh[NODE_NUMBER(theta_tr, thick_tr)].y;
+
+	/* second corner, kept in (xb, yb) whichever triangle is selected */
+	if (side_tr) {
+		/* lower D is actually B here */
+		xb = pmesh[NODE_NUMBER(theta_tr+1, thick_tr)].x;
+		yb = pmesh[NODE_NUMBER(theta_tr+1, thick_tr)].y;
+	} else {
+		/* upper */
+		xb = pmesh[NODE_NUMBER(theta_tr, thick_tr+1)].x;
+		yb = pmesh[NODE_NUMBER(theta_tr, thick_tr+1)].y;
+	}
+
+	/* C */
+	xc = pmesh[NODE_NUMBER(theta_tr+1, thick_tr+1)].x;
+	yc = pmesh[NODE_NUMBER(theta_tr+1, thick_tr+1)].y;
+
+	/* now look for the actual psi node, and move it into the A slot */
+	if (NODE_NUMBER(theta_tr, thick_tr) == NODE_NUMBER(theta_psi, thick_psi)) {
+		/* A nothing to do */
+	} else if (NODE_NUMBER(theta_tr+1, thick_tr+1) == NODE_NUMBER(theta_psi, thick_psi)) {
+		/* psi matches C */
+		/* swap A and C coordinates  */
+		tmp = xa; xa = xc; xc = tmp;
+		tmp = ya; ya = yc; yc = tmp;
+	} else if
+		(side_tr && (NODE_NUMBER(theta_tr+1, thick_tr) == NODE_NUMBER(theta_psi, thick_psi))) {
+		/* psi is D, which was stored in (xb, yb): swap it with A */
+		tmp = xa; xa = xb; xb = tmp;
+		tmp = ya; ya = yb; yb = tmp;
+	} else if
+		(!side_tr && (NODE_NUMBER(theta_tr, thick_tr+1) == NODE_NUMBER(theta_psi, thick_psi))) {
+		/* psi is B, stored in (xb, yb): swap it with A */
+		tmp = xa; xa = xb; xb = tmp;
+		tmp = ya; ya = yb; yb = tmp;
+	} else {
+		/* the psi node is not a node of the current triangle */
+		return 0.0f;
+	}
+
+	/* now the triangle should have A as the psi node */
+	float denom;
+	/* initialized so that an invalid xy cannot return an indeterminate
+	 * value when assertions are compiled out */
+	float value = 0.0f;
+
+	denom = (xa - xb)*(yc - ya) - (xc - xb)*(ya - yb);
+
+	switch (xy) {
+		case X:
+			value = (yc - yb)/denom;
+			break;
+		case Y:
+			value = -(xc - xb)/denom;
+			break;
+		default:
+			assert(0);
+	}
+
+	return value;
+}
+
+/* Y flavour of diff_psi: forwards every argument and selects the Y direction. */
+static inline float diff_y_psi(unsigned theta_tr, unsigned thick_tr, unsigned side_tr,
+		 unsigned theta_psi, unsigned thick_psi, point *pmesh)
+{
+	float dy = diff_psi(theta_tr, thick_tr, side_tr, theta_psi, thick_psi, Y, pmesh);
+
+	return dy;
+}
+
+/* X flavour of diff_psi: forwards every argument and selects the X direction. */
+static inline float diff_x_psi(unsigned theta_tr, unsigned thick_tr, unsigned side_tr,
+		 unsigned theta_psi, unsigned thick_psi, point *pmesh)
+{
+	float dx = diff_psi(theta_tr, thick_tr, side_tr, theta_psi, thick_psi, X, pmesh);
+
+	return dx;
+}
+
+/*
+ * Area of one triangle of the mesh cell whose first corner is node
+ * (theta_tr, thick_tr). The second corner is always (theta_tr+1, thick_tr+1);
+ * the third one is (theta_tr+1, thick_tr) when side_tr is non-zero and
+ * (theta_tr, thick_tr+1) otherwise.
+ */
+static inline float surface_triangle(unsigned theta_tr, unsigned thick_tr, unsigned side_tr, point *pmesh)
+{
+	STARPU_ASSERT(theta_tr + 2 <= ntheta);
+	STARPU_ASSERT(thick_tr + 2 <= nthick);
+
+	point *pi = &pmesh[NODE_NUMBER(theta_tr, thick_tr)];
+	point *pj = &pmesh[NODE_NUMBER(theta_tr+1, thick_tr+1)];
+	point *pk;
+
+	if (side_tr) {
+		/* lower */
+		pk = &pmesh[NODE_NUMBER(theta_tr+1, thick_tr)];
+	} else {
+		pk = &pmesh[NODE_NUMBER(theta_tr, thick_tr+1)];
+	}
+
+	float xi = pi->x, yi = pi->y;
+	float xj = pj->x, yj = pj->y;
+	float xk = pk->x, yk = pk->y;
+
+	/* half the absolute value of the cross product of two edges */
+	float cross = (xi - xj)*(yk -yj) - (xk - xj)*(yi -yj);
+	float area = 0.5*fabs(cross);
+
+	return area;
+}
+
+/*
+ * Contribution of a single triangle to the entry coupling nodes
+ * (theta_i, thick_i) and (theta_j, thick_j): the dot product of the two psi
+ * derivatives multiplied by the triangle area.
+ *
+ * The triangle coordinates are signed because callers pass "node - 1";
+ * a triangle lying outside the mesh contributes 0.
+ */
+static inline float integral_triangle(int theta_tr, int thick_tr, unsigned side_tr,
+			unsigned theta_i, unsigned thick_i, unsigned theta_j, unsigned thick_j, point *pmesh)
+{
+	/* reject cells that are not entirely inside the mesh */
+	if (theta_tr < 0 || theta_tr + 2  > (int)ntheta)
+		return 0.0f;
+
+	if (thick_tr < 0 || thick_tr + 2 > (int)nthick)
+		return 0.0f;
+
+	float grad_i_x = diff_x_psi(theta_tr, thick_tr, side_tr, theta_i, thick_i, pmesh);
+	float grad_i_y = diff_y_psi(theta_tr, thick_tr, side_tr, theta_i, thick_i, pmesh);
+	float grad_j_x = diff_x_psi(theta_tr, thick_tr, side_tr, theta_j, thick_j, pmesh);
+	float grad_j_y = diff_y_psi(theta_tr, thick_tr, side_tr, theta_j, thick_j, pmesh);
+
+	float area = surface_triangle(theta_tr, thick_tr, side_tr, pmesh);
+
+	float contribution = (grad_i_x*grad_j_x + grad_i_y*grad_j_y)*area;
+
+	return contribution;
+}
+
+/*
+ * Sums integral_triangle over the six triangles that have node
+ * (theta_i, thick_i) as a corner. Triangles falling outside the mesh are
+ * discarded by integral_triangle itself.
+ */
+static inline float integrale_sum(unsigned theta_i, unsigned thick_i, unsigned theta_j, unsigned thick_j, point *pmesh)
+{
+	/* cell offset relative to node i, and which triangle of that cell;
+	 * the order is the accumulation order */
+	static const struct { int dtheta; int dthick; unsigned side; } around[6] = {
+		{ -1, -1, 1 },
+		{ -1, -1, 0 },
+		{ -1,  0, 1 },
+		{  0,  0, 0 },
+		{  0,  0, 1 },
+		{  0, -1, 0 },
+	};
+
+	float total = 0.0f;
+	unsigned t;
+
+	for (t = 0; t < 6; t++)
+	{
+		/* unsigned arithmetic on purpose: "0 - 1" wraps exactly like
+		 * the "theta_i - 1" expressions it replaces */
+		total += integral_triangle(theta_i + around[t].dtheta, thick_i + around[t].dthick,
+				around[t].side, theta_i, thick_i, theta_j, thick_j, pmesh);
+	}
+
+	return total;
+}
+
+
+/*
+ * Entry (i, j) of the stiffness matrix, i and j being mesh node numbers.
+ *
+ * Returns 0 unless the two nodes are the same or adjacent on the grid; the
+ * two anti-diagonal neighbours (theta-1, thick+1) and (theta+1, thick-1) are
+ * excluded as well. Otherwise the value is the sum computed by integrale_sum.
+ */
+static float compute_A_value(unsigned i, unsigned j, point *pmesh)
+{
+	float value = 0.0f;
+
+	unsigned thick_i, thick_j;
+	unsigned theta_i, theta_j;
+
+	/* grid coordinates of both nodes */
+	thick_i = NODE_TO_THICK(i);
+	thick_j = NODE_TO_THICK(j);
+
+	theta_i = NODE_TO_THETA(i);
+	theta_j = NODE_TO_THETA(j);
+
+	/* signed offsets from i to j: taking abs() of an unsigned difference
+	 * only worked through wrap-around and an implementation-defined
+	 * conversion back to int */
+	int dthick = (int)thick_j - (int)thick_i;
+	int dtheta = (int)theta_j - (int)theta_i;
+
+	/* nodes further than one step apart do not share any triangle */
+	if (abs(dthick) > 1 || abs(dtheta) > 1)
+		return value;
+
+	/* neither do the two anti-diagonal neighbours */
+	if ((dtheta == -1 && dthick == 1) || (dtheta == 1 && dthick == -1))
+		return value;
+
+	/* this may not be a null entry */
+	value += integrale_sum(theta_i, thick_i, theta_j, thick_j, pmesh);
+
+	return value;
+}
+
+
+/* Both macros expect arrays named RefArray / RefArrayBack to be in scope at the
+ * point of use. TRANSLATE maps a reordered index to a mesh node number and
+ * TRANSLATEBACK is the reverse lookup (see compute_pivot_array). */
+#define TRANSLATE(k)	(RefArray[(k)])
+#define TRANSLATEBACK(k)	(RefArrayBack[(k)])
+
+/*
+ * Solves the factorized system in place, then scatters the solution back to
+ * mesh node order.
+ *
+ *   size     total number of mesh nodes (asserted equal to DIM)
+ *   subsize  order of the matrix A and length of B
+ *   result   output, indexed by mesh node number, size entries
+ *   RefArray reordered index -> mesh node number
+ *   Bformer  values indexed by mesh node number; copied to result for the
+ *            indices subsize..size-1
+ *   A        subsize x subsize matrix handed to the two triangular solves
+ *   B        right-hand side, overwritten with the solution
+ *
+ * With CHECK_RESULTS defined, the solution is multiplied back by U then L
+ * and the largest component of the difference with the saved right-hand
+ * side is printed.
+ */
+static void solve_system(unsigned size, unsigned subsize, float *result, int *RefArray, float *Bformer, float *A, float *B)
+{
+
+
+	unsigned i;
+
+	/* solve the actual problem LU X = B */
+        /* solve LX' = Y with X' = UX */
+        /* solve UX = X' */
+	fprintf(stderr, "Solving the problem ...\n");
+
+#ifdef CHECK_RESULTS
+	/* keep a copy of the right-hand side, B is overwritten below */
+	float *savedB = malloc(subsize*sizeof(float));
+	memcpy(savedB, B, subsize*sizeof(float));
+
+	float *LUB = malloc(subsize*sizeof(float));
+#endif
+
+	/* L */
+	/* presumably the strings are the BLAS uplo / trans / diag flags — confirm against ../common/blas.h */
+	STRSV("L", "N", "N", subsize, A, subsize, B, 1);
+
+	/* U */
+        STRSV("U", "N", "U", subsize, A, subsize, B, 1);
+
+	STARPU_ASSERT(DIM == size);
+
+#ifdef CHECK_RESULTS
+	/* compute the error on (LUB - savedB) which should be 0 */
+
+	/* LUB = B */
+	memcpy(LUB, B, subsize*sizeof(float));
+
+
+	/* LUB = U * LUB */
+	STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
+	
+	/* LUB = L * LUB */
+	STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
+
+	/* LUB -= B */
+	SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
+
+	/* check if LUB is close to the 0 vector */
+	/* the index returned by ISAMAX is used as 1-based, hence the "- 1" */
+	int maxind = ISAMAX(subsize, LUB, 1);
+	fprintf(stderr, "max (LUX - B) = %f\n",LUB[maxind - 1]);
+
+	free(LUB);
+	free(savedB);
+#endif
+
+	/* now display back the ACTUAL result */
+	/* the first subsize entries are the computed unknowns */
+	for (i = 0; i < subsize; i++)
+	{
+		result[TRANSLATE(i)] = B[i];
+	}
+
+	/* the remaining entries keep the value they had in Bformer */
+	for (i = subsize ; i < size; i++)
+	{
+		result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
+	}
+
+}
+
+/*
+ * Builds the permutation that numbers the inner mesh nodes first and the
+ * boundary nodes after them.
+ *
+ * RefArray[new index] gives the mesh node number and RefArrayBack[node
+ * number] gives the new index; both hold size entries. Returns the number of
+ * inner nodes, i.e. the new index of the first boundary node.
+ */
+unsigned compute_pivot_array(int *RefArray, int *RefArrayBack, unsigned size)
+{
+	unsigned n;
+	unsigned theta, thick;
+	unsigned node;
+	unsigned slot = 0;
+	unsigned ninner;
+
+	/* start from the identity */
+	for (n = 0; n < size; n++)
+	{
+		RefArray[n] = n;
+		RefArrayBack[n] = n;
+	}
+
+	/* inner nodes are unknown: they come first */
+	for (theta = 1; theta < ntheta - 1 ; theta++)
+	{
+		for (thick = 1; thick < nthick - 1; thick++) 
+		{
+			node = NODE_NUMBER(theta, thick);
+
+			RefArrayBack[node] = slot;
+			RefArray[slot] = node;
+			slot++;
+		}
+	}
+
+	ninner = slot;
+
+	/* then both ends of every theta line ... */
+	for (theta=0; theta < ntheta; theta++)
+	{
+		/* Lower boundary "South" */
+		node = NODE_NUMBER(theta, 0);
+		RefArrayBack[node] = slot;
+		RefArray[slot] = node;
+		slot++;
+		
+		/* Upper boundary "North" */
+		node = NODE_NUMBER(theta, nthick-1);
+		RefArrayBack[node] = slot;
+		RefArray[slot] = node;
+		slot++;
+	}
+
+	/* ... and the two remaining sides, corners excluded */
+	for (thick = 1; thick < nthick -1; thick++)
+	{
+		/* "West "*/
+		node = NODE_NUMBER(0, thick);
+		RefArrayBack[node] = slot;
+		RefArray[slot] = node;
+		slot++;
+
+		/* "East" */
+		node = NODE_NUMBER(ntheta-1, thick);
+		RefArrayBack[node] = slot;
+		RefArray[slot] = node;
+		slot++;
+	}
+
+	/* every node must have been numbered exactly once */
+	assert(slot == size);
+
+	return ninner;
+}
+
+/*
+ * Fills mesh[] with the coordinates of the ntheta x nthick nodes. The layout
+ * depends on the file-level "shape" setting: 1 and 2 have dedicated formulas,
+ * any other value uses the polar layout (radius between RMIN and RMAX).
+ */
+void build_mesh(point *mesh)
+{
+	unsigned theta, thick;
+
+	for (theta = 0; theta < ntheta; theta++)
+	{
+		float angle = (ntheta - 1 - theta) * Pi/(ntheta-1);
+
+		for (thick = 0; thick < nthick; thick++)
+		{
+			float r = thick * (RMAX - RMIN)/(nthick - 1) + RMIN;
+			point *p = &mesh[NODE_NUMBER(theta,thick)];
+
+			if (shape == 1) {
+				/* regular grid */
+				p->x = -100 + RMIN+((RMAX-RMIN)*theta)/(ntheta - 1);
+				p->y = RMIN+((RMAX-RMIN)*thick)/(nthick - 1);
+			}
+			else if (shape == 2) {
+				p->x = r*(2.0f*theta/(ntheta - 1)- 1.0f);
+				p->y = r*(2.0f*thick/(nthick - 1)- 1.0f);
+			}
+			else {
+				/* shape 0 and any unknown value */
+				p->x = r*cosf(angle);
+				p->y = r*sinf(angle);
+			}
+		}
+	}
+}
+
+/*
+ * Lists, in increasing order, the reordered indices of the grid neighbours of
+ * "node" (itself included).
+ *
+ *   neighbours    output array; up to 9 entries are written
+ *   node          reordered index of the node of interest
+ *   RefArray      reordered index -> mesh node number
+ *   RefArrayBack  mesh node number -> reordered index
+ *
+ * Returns the number of entries written.
+ */
+static unsigned build_neighbour_vector(unsigned *neighbours, unsigned node, int *RefArray, int *RefArrayBack)
+{
+	/* where is that point in the former space ? */
+	int former = TRANSLATE(node);
+	int former_thick, former_theta;
+	former_thick= (int)NODE_TO_THICK(former);
+	former_theta = (int)NODE_TO_THETA(former);
+
+	/* do a list of all the possible neighbours */
+	unsigned nneighbours = 0;
+
+	int dtheta, dthick;
+	for (dthick = -1; dthick <= 1; dthick++)
+	{
+		/* valid coordinates are 0 .. nthick-1: accepting nthick itself
+		 * would alias onto the next theta line */
+		if ((former_thick + dthick) >= 0 && (former_thick + dthick) < (int)nthick )
+		{
+			for (dtheta = -1; dtheta <= 1; dtheta++)
+			{
+				/* likewise 0 .. ntheta-1: ntheta itself would index
+				 * past the end of RefArrayBack */
+				if ((former_theta + dtheta) >= 0 && (former_theta + dtheta) < (int)ntheta )
+				{
+					/* we got a possible neighbour */
+					unsigned candidate = 
+						NODE_NUMBER((former_theta + dtheta), (former_thick + dthick));
+
+					neighbours[nneighbours++] = TRANSLATEBACK(candidate);
+				}
+			}
+		}
+	}
+
+	unsigned i;
+	/* order that list (selection sort, at most 9 entries) */
+	for (i = 0; i < nneighbours; i++)
+	{
+		/* find the i^th smallest entry for position i */
+		unsigned index;
+		unsigned min , min_index;
+
+		min = neighbours[i];
+		min_index = i;
+
+		for (index = i+1; index < nneighbours; index++)
+		{
+			/* a node cannot be listed twice */
+			STARPU_ASSERT(neighbours[i] != neighbours[index]);
+
+			if (neighbours[index] < min)
+			{
+				min = neighbours[index];
+				min_index = index;
+			}
+		}
+
+		/* swap values */
+		neighbours[min_index] = neighbours[i];
+		neighbours[i] = min;
+	}
+
+	return nneighbours;
+}
+
+/*
+ * Builds the right-hand side of the reduced system.
+ *
+ * Bformer (size entries, indexed by mesh node number) first receives the
+ * boundary values. B (newsize entries, reordered indices) is then computed by
+ * moving the contribution of every neighbour whose reordered index is
+ * >= newsize to the right-hand side.
+ */
+static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bformer, unsigned size, unsigned newsize, int *RefArray, int *RefArrayBack)
+{
+	unsigned n, row;
+
+	/* everything defaults to 0 ... */
+	for (n = 0; n < size; n++)
+		Bformer[n] = 0.0f;
+
+	/* ... the first and the last nthick nodes are set to 200 ... */
+	for (n = 0; n < nthick; n++)
+	{
+		Bformer[n] = 200.0f;
+		Bformer[size-1-n] = 200.0f;
+	}
+
+	/* ... and on the other theta lines, thick == 0 gets 200 and
+	 * thick == nthick-1 gets 100 */
+	for (n = 1; n < ntheta-1; n++)
+	{
+		Bformer[n*nthick] = 200.0f;
+		Bformer[(n+1)*nthick-1] = 100.0f;
+	}
+
+	/* now the actual (reordered) right-hand side */
+	for (row = 0 ; row < newsize ; row++)
+	{
+		unsigned neighbours[9];
+		unsigned count = build_neighbour_vector(neighbours, row, RefArray, RefArrayBack);
+		unsigned k;
+
+		B[row] = Bformer[TRANSLATE(row)];
+
+		for (k = 0; k < count; k++)
+		{
+			unsigned col = neighbours[k]; 
+
+			/* indices below newsize stay on the left-hand side */
+			if (col < newsize)
+				continue;
+
+			B[row] -= compute_A_value(TRANSLATE(col), TRANSLATE(row), pmesh)*Bformer[TRANSLATE(col)];
+		}
+	}
+}
+
+/*
+ * Builds the reduced stiffness matrix in compressed row form.
+ *
+ *   nzval, colind  outputs: heap arrays holding the value and the column of
+ *                  every stored entry; the caller owns them and releases
+ *                  them with free(). Both stay NULL when nothing is stored.
+ *                  They may be allocated larger than the returned count.
+ *   rowptr         caller-provided array of newsize+1 entries; rowptr[j] is
+ *                  the position of the first entry of row j
+ *
+ * Returns the number of stored entries. Exits the program if memory runs out.
+ */
+static unsigned build_sparse_stiffness_matrix_A(point *pmesh, float **nzval, uint32_t **colind, 
+						uint32_t *rowptr, unsigned newsize, int *RefArray, int *RefArrayBack)
+{
+	unsigned j;
+
+	unsigned pos = 0;
+	unsigned capacity = 0;
+
+	*nzval = NULL;
+	*colind = NULL;
+
+	/* now the actual stiffness (reordered) matrix*/
+	for (j = 0 ; j < newsize ; j++)
+	{
+		rowptr[j] = pos;
+
+		unsigned neighbour;
+		unsigned nneighbours;
+		unsigned neighbours[9];
+
+		nneighbours = build_neighbour_vector(&neighbours[0], j, RefArray, RefArrayBack);
+
+		for (neighbour = 0; neighbour < nneighbours; neighbour++)
+		{
+			float val;
+			unsigned nodeneighbour =  neighbours[neighbour];
+
+			/* columns >= newsize are not part of the reduced system */
+			if (nodeneighbour >= newsize)
+				continue;
+
+			val = compute_A_value(TRANSLATE(j), TRANSLATE(nodeneighbour), pmesh);
+
+			/* only non null entries are stored */
+			if (val == 0.0f)
+				continue;
+
+			if (pos == capacity) {
+				/* grow geometrically instead of once per entry, and
+				 * keep the old blocks reachable if realloc fails */
+				unsigned grown = capacity ? 2*capacity : 64;
+
+				float *vals = realloc(*nzval, grown*sizeof(float));
+				if (vals)
+					*nzval = vals;
+
+				uint32_t *cols = realloc(*colind, grown*sizeof(uint32_t));
+				if (cols)
+					*colind = cols;
+
+				if (!vals || !cols) {
+					fprintf(stderr, "could not allocate the sparse stiffness matrix\n");
+					free(*nzval);
+					free(*colind);
+					exit(EXIT_FAILURE);
+				}
+
+				capacity = grown;
+			}
+
+			(*nzval)[pos] = val;
+			(*colind)[pos] = nodeneighbour;
+
+			pos++;
+		}
+	}
+
+	/* closing entry: one past the last stored position */
+	rowptr[newsize] = pos;
+
+	return pos;
+}
+
+/*
+ * Fills the dense newsize x newsize matrix A with the reduced stiffness
+ * matrix: entry (j, neighbour) is stored at A[j + newsize*neighbour], every
+ * other entry is 0.
+ */
+static void build_dense_stiffness_matrix_A(point *pmesh, float *A, unsigned newsize, int *RefArray, int *RefArrayBack)
+{
+	unsigned j;
+
+	/* touch all the memory */
+	/* sizes and offsets are computed in size_t: newsize*newsize wraps in
+	 * unsigned arithmetic as soon as newsize reaches 65536 */
+	memset(A, 0, (size_t)newsize*newsize*sizeof(float));
+
+	/* now the actual stiffness (reordered) matrix*/
+	for (j = 0 ; j < newsize ; j++)
+	{
+		unsigned neighbour;
+		unsigned nneighbours;
+		unsigned neighbours[9];
+
+		nneighbours = build_neighbour_vector(&neighbours[0], j, RefArray, RefArrayBack);
+
+		for (neighbour = 0; neighbour < nneighbours; neighbour++)
+		{
+			unsigned nodeneighbour =  neighbours[neighbour];
+
+			/* indices >= newsize are not part of the reduced system */
+			if (nodeneighbour < newsize) {
+				float val;
+				val = compute_A_value(TRANSLATE(j), TRANSLATE(nodeneighbour), pmesh);
+				A[j + (size_t)newsize*nodeneighbour] = val;
+			}
+		}
+	}
+}
+
+/*
+ * Builds the mesh, removes the boundary nodes from the system, then solves it
+ * either with the conjugate gradient (-cg) or with a LU decomposition, and
+ * optionally renders the result (OPENGL_RENDER).
+ *
+ * Returns 0 on success, EXIT_FAILURE when the working arrays cannot be
+ * allocated.
+ */
+int main(int argc, char **argv)
+{
+	float *A;
+	float *B;
+
+	unsigned newsize;
+	float *result;
+	int *RefArray, *RefArrayBack;
+	point *pmesh;
+	float *Bformer;
+
+	argc_ = argc;
+	argv_ = argv;
+
+#ifdef USE_MARCEL
+	marcel_init(&argc, argv);
+#endif
+
+	parse_args(argc, argv);
+
+	pmesh = malloc(DIM*sizeof(point));
+	RefArray = malloc(DIM*sizeof(int));
+	RefArrayBack = malloc(DIM*sizeof(int));
+	Bformer = malloc(DIM*sizeof(float));
+	result = malloc(DIM*sizeof(float));
+
+	if (!pmesh || !RefArray || !RefArrayBack || !Bformer || !result) {
+		fprintf(stderr, "could not allocate the mesh arrays\n");
+		free(pmesh);
+		free(RefArray);
+		free(RefArrayBack);
+		free(Bformer);
+		free(result);
+		return EXIT_FAILURE;
+	}
+
+	build_mesh(pmesh);
+
+#ifdef USE_POSTSCRIPT
+	postscript_gen();
+#endif
+
+	/* now simplify that problem given the boundary conditions:
+	 * the already known variables are removed from the system by
+	 * pivoting them to the end; RefArray keeps track of that pivoting */ 
+	newsize = compute_pivot_array(RefArray, RefArrayBack, DIM);
+
+	/* we can either use a direct method (LU decomposition here) or an 
+	 * iterative method (conjugate gradient here) */
+	if (use_cg) {
+		unsigned nnz;
+		float *nzval;
+		uint32_t *colind;
+		uint32_t *rowptr;
+
+		rowptr = malloc((newsize+1)*sizeof(uint32_t));
+
+		B = malloc(newsize*sizeof(float));
+
+		if (!rowptr || !B) {
+			fprintf(stderr, "could not allocate the sparse system\n");
+			free(rowptr);
+			free(B);
+			free(pmesh);
+			free(RefArray);
+			free(RefArrayBack);
+			free(Bformer);
+			free(result);
+			return EXIT_FAILURE;
+		}
+
+		build_sparse_stiffness_matrix_B(pmesh, B, Bformer, DIM, newsize, RefArray, RefArrayBack);
+
+		nnz = build_sparse_stiffness_matrix_A(pmesh, &nzval, &colind, rowptr, newsize, RefArray, RefArrayBack);
+
+#if 0
+		printf("nnz : %d\n", nnz);
+
+		fprintf(stdout, "MUMPS FORMAT BEGIN\n");
+		FILE *fm = fopen("input_mumps", "w+");
+		fprintf(fm, "%d\t:N\n%d\t:NZ\n", newsize, nnz);
+
+		unsigned r;
+		for (r = 0; r < newsize; r++)
+		{
+			int first_ind = rowptr[r];
+			int last_ind = rowptr[r+1];
+
+			int ind;
+			for (ind = first_ind; ind < last_ind; ind++)
+			{
+				 fprintf(fm, "%d %d %f\n", colind[ind]+1, r+1, nzval[ind]);
+			}
+		} 
+
+		for (r = 0; r < newsize; r++)
+		{
+			fprintf(fm, "%f\n", B[r]);
+		}
+	
+		fclose(fm);
+		fprintf(stdout, "MUMPS FORMAT END\n");
+#endif		
+
+		do_conjugate_gradient(nzval, B, result, nnz, newsize, colind, rowptr);
+
+		/* XXX */
+		/* B is reused as scratch: result is about to be rewritten in
+		 * mesh node order */
+		memcpy(B, result, newsize*sizeof(float));
+
+		/* now display back the ACTUAL result */
+		unsigned i;
+		for (i = 0; i < newsize; i++)
+		{
+			result[TRANSLATE(i)] = B[i];
+		}
+	
+		for (i = newsize ; i < DIM; i++)
+		{
+			result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
+		}
+
+		/* the sparse system is not needed once result is complete */
+		/* NOTE(review): assumes do_conjugate_gradient keeps no reference
+		 * to these buffers after it returns — confirm */
+		free(nzval);
+		free(colind);
+		free(rowptr);
+		free(B);
+	
+	}
+	else {
+
+		/* unfortunately CUDA does not allow late memory registration, 
+		 * we need to do the malloc using CUDA itself ... */
+		/* A and B are therefore owned by initialize_system and are not
+		 * released with free() here */
+		initialize_system(&A, &B, newsize, pinned);
+
+		/* then build the stiffness matrix A */
+		build_sparse_stiffness_matrix_B(pmesh, B, Bformer, DIM, newsize, RefArray, RefArrayBack);
+
+		build_dense_stiffness_matrix_A(pmesh, A, newsize, RefArray, RefArrayBack);
+
+		/* all four values are unsigned */
+		fprintf(stderr, "Problem size : %ux%u (%ux%u)\n", newsize, newsize, DIM, DIM);
+
+		if (version < 3) {
+			dw_factoLU(A, newsize, newsize, nblocks, version);
+		}
+		else {
+			dw_factoLU_tag(A, newsize, newsize, nblocks);
+		}
+
+		display_stat_heat();
+
+		solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
+
+		starpu_shutdown();
+	}
+
+#ifdef OPENGL_RENDER
+	opengl_render(ntheta, nthick, result, pmesh, argc, argv);
+#endif
+
+	free(pmesh);
+	free(RefArray);
+	free(RefArrayBack);
+	free(Bformer);
+	free(result);
+
+	return 0;
+}

+ 70 - 0
examples/heat/heat.h

@@ -0,0 +1,70 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __HEAT_H__
+#define __HEAT_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+// needed for OPENGL_RENDER
+#include <starpu_config.h>
+#include <starpu.h>
+
+#include "../common/blas.h"
+
+
+#ifdef OPENGL_RENDER
+#include <GL/gl.h>
+#include <GL/glu.h>
+#include <GL/glut.h>
+#endif
+
+/* direction selectors */
+#define X	0
+#define Y	1
+
+/* total number of mesh nodes; parenthesized so that it behaves as a single
+ * value inside a larger expression (e.g. "a / DIM" or "a % DIM").
+ * ntheta and nthick must be visible wherever the macro is expanded. */
+#define DIM	(ntheta*nthick)
+
+/* bounds used for the node coordinates */
+#define RMIN	(150.0f)
+#define RMAX	(200.0f)
+
+#define Pi	(3.141592f)
+
+/* a node is numbered thick + theta*nthick; the two macros below invert that */
+#define NODE_NUMBER(theta, thick)	((thick)+(theta)*nthick)
+#define NODE_TO_THICK(n)		((n) % nthick)
+#define NODE_TO_THETA(n)		((n) / nthick)
+
+//#define USE_POSTSCRIPT	1
+
+/* coordinates of one mesh node */
+typedef struct point_t {
+	float x;
+	float y;
+} point;
+
+extern void dw_factoLU(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned version);
+extern void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks);
+extern void initialize_system(float **A, float **B, unsigned dim, unsigned pinned);
+
+void display_stat_heat(void);
+
+#ifdef OPENGL_RENDER
+extern void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_);
+#endif
+
+#endif // __HEAT_H__

+ 283 - 0
examples/heat/heat_display.c

@@ -0,0 +1,283 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "heat.h"
+
+#ifdef OPENGL_RENDER
+/*
+ * Just some dummy OpenGL code to display our results 
+ *
+ */
+
+/* smallest and largest value found in result[] by find_limits */
+static float minval, maxval;
+
+/* private copies of the arguments given to opengl_render */
+static unsigned ntheta;
+static unsigned nthick;
+static float *result;
+/* when non zero, generate_graph also draws the triangle outlines */
+static unsigned printmesh =0;
+static point *pmesh;
+
+
+/* bounding box of the mesh and its centre, computed by find_limits */
+float xmin, xmax, ymin, ymax;
+float xcenter, ycenter;
+
+/*
+ * Draws every mesh cell as two filled triangles (A, D, C) and (A, C, B),
+ * each corner being coloured after the value of result[] at that node.
+ * When printmesh is set, the outline of both triangles is drawn as well.
+ */
+static void generate_graph(void)
+{
+	unsigned theta, thick;
+
+	for (theta = 0; theta < ntheta-1; theta++)
+	{
+		for (thick = 0; thick < nthick-1; thick++)
+		{
+			/* the four corners of the current cell */
+			unsigned nodeA = NODE_NUMBER(theta, thick);
+			unsigned nodeB = NODE_NUMBER(theta, thick+1);
+			unsigned nodeC = NODE_NUMBER(theta+1, thick+1);
+			unsigned nodeD = NODE_NUMBER(theta+1, thick);
+
+			float colorA_R, colorB_R, colorC_R, colorD_R;
+			float colorA_G, colorB_G, colorC_G, colorD_G;
+			float colorA_B, colorB_B, colorC_B, colorD_B;
+
+			if (maxval == minval) {
+				/* uniform field: everything is white, and the
+				 * division by a null amplitude is avoided */
+				colorA_R = 1.0f; colorA_G = 1.0f; colorA_B = 1.0f;
+				colorB_R = 1.0f; colorB_G = 1.0f; colorB_B = 1.0f;
+				colorC_R = 1.0f; colorC_G = 1.0f; colorC_B = 1.0f;
+				colorD_R = 1.0f; colorD_G = 1.0f; colorD_B = 1.0f;
+			}
+			else {
+				float amplitude = maxval - minval;
+
+				float coeffA, coeffB, coeffC, coeffD;
+
+				/* value rescaled to [0, 1] */
+				coeffA = (result[nodeA] - minval)/amplitude;
+				coeffB = (result[nodeB] - minval)/amplitude;
+				coeffC = (result[nodeC] - minval)/amplitude;
+				coeffD = (result[nodeD] - minval)/amplitude;
+
+				/* red grows from 0 to 1 over the lower half of the range */
+				colorA_R = coeffA>0.5f?1.0f:(2.0*coeffA)*1.0f; 
+				colorB_R = coeffB>0.5f?1.0f:(2.0*coeffB)*1.0f; 
+				colorC_R = coeffC>0.5f?1.0f:(2.0*coeffC)*1.0f; 
+				colorD_R = coeffD>0.5f?1.0f:(2.0*coeffD)*1.0f; 
+
+				/* blue is never used */
+				colorA_B = 0.0f; 
+				colorB_B = 0.0f; 
+				colorC_B = 0.0f; 
+				colorD_B = 0.0f; 
+
+				/* green falls from 1 to 0 over the upper half of the range */
+				colorA_G = coeffA<0.5f?1.0f:2.0*(1 - coeffA)*1.0f;
+				colorB_G = coeffB<0.5f?1.0f:2.0*(1 - coeffB)*1.0f;
+				colorC_G = coeffC<0.5f?1.0f:2.0*(1 - coeffC)*1.0f;
+				colorD_G = coeffD<0.5f?1.0f:2.0*(1 - coeffD)*1.0f;
+			}
+
+			if (printmesh) {
+				/* black outlines of both triangles */
+				glColor3f (0.0f, 0.0f, 0.0f);
+				glPolygonMode(GL_FRONT_AND_BACK, GL_LINE);
+				glLineWidth(3.0f);
+				glBegin(GL_POLYGON);
+				glVertex3f(pmesh[nodeA].x, pmesh[nodeA].y, 2.0f);
+				glVertex3f(pmesh[nodeD].x, pmesh[nodeD].y, 2.0f);
+				glVertex3f(pmesh[nodeC].x, pmesh[nodeC].y, 2.0f);
+				glVertex3f(pmesh[nodeA].x, pmesh[nodeA].y, 2.0f);
+				glEnd();
+
+				glBegin(GL_POLYGON);
+				glVertex3f(pmesh[nodeA].x, pmesh[nodeA].y, 1.0f);
+				glVertex3f(pmesh[nodeC].x, pmesh[nodeC].y, 1.0f);
+				glVertex3f(pmesh[nodeB].x, pmesh[nodeB].y, 1.0f);
+				glVertex3f(pmesh[nodeA].x, pmesh[nodeA].y, 1.0f);
+				glEnd();
+			}
+
+			/* filled triangle A, D, C */
+			glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+			glBegin(GL_POLYGON);
+   				glColor3f (colorA_R, colorA_G, colorA_B);
+				glVertex3f(pmesh[nodeA].x, pmesh[nodeA].y, 0.0f);
+   				glColor3f (colorD_R, colorD_G, colorD_B);
+				glVertex3f(pmesh[nodeD].x, pmesh[nodeD].y, 0.0f);
+   				glColor3f (colorC_R, colorC_G, colorC_B);
+				glVertex3f(pmesh[nodeC].x, pmesh[nodeC].y, 0.0f);
+			glEnd();
+
+			/* filled triangle A, C, B */
+			glBegin(GL_POLYGON);
+   				glColor3f (colorA_R, colorA_G, colorA_B);
+				glVertex3f(pmesh[nodeA].x, pmesh[nodeA].y, 0.0f);
+   				glColor3f (colorC_R, colorC_G, colorC_B);
+				glVertex3f(pmesh[nodeC].x, pmesh[nodeC].y, 0.0f);
+   				glColor3f (colorB_R, colorB_G, colorB_B);
+				glVertex3f(pmesh[nodeB].x, pmesh[nodeB].y, 0.0f);
+			glEnd();
+		}
+	}
+}
+
+/*
+ * Display callback: clears the window, scales the scene by the inverse of the
+ * largest side of the mesh bounding box, looks at the centre of the mesh and
+ * redraws it.
+ */
+static void display(void)
+{
+	float extent;
+	float scale;
+
+	glClear (GL_COLOR_BUFFER_BIT);
+	glLoadIdentity ();             /* clear the matrix */
+
+	extent = STARPU_MAX(xmax - xmin, ymax - ymin);
+	scale = 1.0/extent;
+
+	glScalef (scale, scale, scale);      /* modeling transformation */
+	gluLookAt (xcenter, ycenter, 30.0f, xcenter, ycenter, 0.0f, 0.0f, 1.0f, 0.0f);
+
+	generate_graph();
+	glFlush ();
+}
+
+
+/*
+ * Keyboard callback: 'q' leaves the program, any other key toggles the mesh
+ * outlines and redraws the scene. The pointer position is ignored.
+ */
+static void pressKey(unsigned char key, int x __attribute__ ((unused)), int y  __attribute__ ((unused)))
+{
+	if (key == 'q')
+		exit(0);
+
+	printmesh = !printmesh;
+	display();
+}
+
+
+
+/*
+ * Reshape callback: makes the viewport cover the whole w x h window and
+ * reloads the projection matrix.
+ */
+static void reshape (int w, int h)
+{
+	glViewport (0, 0, (GLsizei) w, (GLsizei) h);
+	glMatrixMode (GL_PROJECTION);
+	glLoadIdentity ();
+	/* NOTE(review): the near and far values are identical (5.0f, 5.0f); the
+	 * OpenGL reference lists that as an error for glFrustum, which would
+	 * leave the projection as the identity loaded above — confirm intent */
+	glFrustum (xmin, xmax, ymin, ymax, 5.0f, 5.0f);
+	glMatrixMode (GL_MODELVIEW);
+}
+
+
+/*
+ * Scans result[] and pmesh[] to set minval / maxval, the bounding box
+ * (xmin, xmax, ymin, ymax) of the mesh and its centre (xcenter, ycenter).
+ */
+void find_limits(void)
+{
+	unsigned n;
+
+	/* extrema of the values */
+	minval = 100000000.0f;
+	maxval = -10000000.0f;
+
+	for (n = 0; n < DIM; n++)
+	{
+		minval = STARPU_MIN(result[n], minval);
+		maxval = STARPU_MAX(result[n], maxval);
+	}
+
+	/* bounding box of the nodes */
+	xmin = 10000000.0f;
+	xmax = -10000000.0f;
+	ymin = 10000000.0f;
+	ymax = -10000000.0f;
+
+	/* node numbers are thick + theta*nthick, so walking 0 .. DIM-1
+	 * visits exactly the nodes of the theta / thick double loop */
+	for (n = 0; n < DIM; n++)
+	{
+		float px = pmesh[n].x;
+		float py = pmesh[n].y;
+
+		if (px < xmin)
+			xmin = px;
+
+		if (px > xmax)
+			xmax = px;
+
+		if (py < ymin)
+			ymin = py;
+
+		if (py > ymax)
+			ymax = py;
+	}
+
+	ycenter = (ymin + ymax)/2;
+	xcenter = (xmin + xmax)/2;
+}
+
+/*
+ * Opens a GLUT window and displays the field _result (one value per mesh
+ * node) over the mesh _pmesh of _ntheta x _nthick nodes.
+ *
+ * The arguments are stored in the file-level variables used by the GLUT
+ * callbacks; argc_ / argv_ are handed to glutInit. The function ends by
+ * entering the GLUT main loop.
+ */
+void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_pmesh, int argc_, char **argv_)
+{
+	fprintf(stderr, "OpenGL rendering ... \n");
+
+	ntheta = _ntheta;
+	nthick = _nthick;
+	result = _result;
+	printmesh = 0;
+	pmesh = _pmesh;
+
+	/* the callbacks rely on the limits being known */
+	find_limits();
+
+	glutInit(&argc_, argv_);
+	glutInitDisplayMode (GLUT_SINGLE | GLUT_RGB);
+	glutInitWindowSize (800, 800);
+	glutInitWindowPosition (100, 100);
+	glutCreateWindow ("Temperature");
+
+	/* init */
+	glClearColor (0.0, 0.0, 0.0, 0.0);
+	/* glShadeModel only accepts GL_FLAT or GL_SMOOTH: GL_MODELVIEW was
+	 * rejected, which left the default smooth shading in place. Ask for
+	 * it explicitly, the per-vertex colours need it. */
+	glShadeModel (GL_SMOOTH);
+
+	glutKeyboardFunc(pressKey);
+	glutDisplayFunc(display);
+	glutReshapeFunc(reshape);
+	glutMainLoop();
+}
+#endif // OPENGL_RENDER
+
+#ifdef USE_POSTSCRIPT
+/*
+ * Writes the outline of every mesh triangle to "output.ps", using integer
+ * coordinates shifted by (offx, offy). Nothing is written when the file
+ * cannot be opened.
+ */
+static void postscript_gen(void)
+{
+	FILE *psfile;
+	psfile = fopen("output.ps", "w+");
+	if (!psfile) {
+		/* do not hand a NULL stream to fprintf */
+		perror("output.ps");
+		return;
+	}
+
+	int offx, offy;
+	unsigned theta, thick;
+
+	offx = RMAX+50;
+	offy = 100;
+
+	for (theta = 0; theta < ntheta-1; theta++)
+	{
+		for (thick = 0; thick < nthick-1; thick++)
+		{
+			/* first triangle of the cell */
+			fprintf(psfile, "newpath\n");
+			fprintf(psfile, "%d %d moveto\n", (int)pmesh[NODE_NUMBER(theta, thick)].x + offx,
+					(int)pmesh[NODE_NUMBER(theta, thick)].y+ offy);
+			fprintf(psfile, "%d %d lineto\n", (int)pmesh[NODE_NUMBER(theta+1, thick)].x + offx,
+					(int)pmesh[NODE_NUMBER(theta+1, thick)].y+ offy);
+			fprintf(psfile, "%d %d lineto\n", (int)pmesh[NODE_NUMBER(theta+1, thick+1)].x + offx,
+					(int)pmesh[NODE_NUMBER(theta+1, thick+1)].y+ offy);
+			fprintf(psfile, "closepath\n");
+			fprintf(psfile, "stroke\n");
+
+			/* second triangle of the cell */
+			fprintf(psfile, "newpath\n");
+			fprintf(psfile, "%d %d moveto\n", (int)pmesh[NODE_NUMBER(theta, thick)].x + offx,
+					(int)pmesh[NODE_NUMBER(theta, thick)].y+ offy);
+			fprintf(psfile, "%d %d lineto\n", (int)pmesh[NODE_NUMBER(theta, thick+1)].x + offx,
+					(int)pmesh[NODE_NUMBER(theta, thick+1)].y+ offy);
+			fprintf(psfile, "%d %d lineto\n", (int)pmesh[NODE_NUMBER(theta+1, thick+1)].x + offx,
+					(int)pmesh[NODE_NUMBER(theta+1, thick+1)].y+ offy);
+			fprintf(psfile, "closepath\n");
+
+			fprintf(psfile, "stroke\n");
+		}
+	}
+
+	fclose(psfile);
+}
+#endif
+
+

+ 277 - 0
examples/heat/lu_kernels_model.c

@@ -0,0 +1,277 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "lu_kernels_model.h"
+
+/*
+ * As a convention, in that file, descr[0] is represented by A,
+ * 				  descr[1] is B ...
+ */
+
+/*
+ *	Number of flops of Gemm 
+ */
+
+//#define USE_PERTURBATION	1
+
+
+/* With USE_PERTURBATION, a cost is multiplied by a random factor taken in
+ * [1 - AMPL, 1 + AMPL]; otherwise it is returned unchanged.
+ * NOTE(review): AMPL is not defined in this file — confirm where it comes
+ * from before enabling USE_PERTURBATION. */
+#ifdef USE_PERTURBATION
+#define PERTURBATE(a)	((drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
+#else
+#define PERTURBATE(a)	(a)
+#endif
+
+/* 
+ *
+ *	Generic models
+ *
+ */
+
+/* Generic cost model used by model_11: n^3/537.5, n being the nx of descr[0]. */
+double task_11_cost(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/537.5);
+
+	return PERTURBATE(cost);
+}
+
+/* Generic cost model used by model_12: n^3/3210.80, n being the nx of descr[0]. */
+double task_12_cost(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/3210.80);
+
+	return PERTURBATE(cost);
+}
+
+
+/* Generic cost model used by model_21: n^3/3691.53, n being the nx of descr[0]. */
+double task_21_cost(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/3691.53);
+
+	return PERTURBATE(cost);
+}
+
+
+
+/*
+ * Generic cost model used by model_22: nx*ny*nz/4110.0, with nx and ny taken
+ * from descr[2] and nz being the ny of descr[0].
+ */
+double task_22_cost(starpu_buffer_descr *descr)
+{
+	/* held in doubles: the product of three uint32_t wraps above 2^32 */
+	double nx = starpu_get_blas_nx(descr[2].state);
+	double ny = starpu_get_blas_ny(descr[2].state);
+	double nz = starpu_get_blas_ny(descr[0].state);
+
+	double cost = ((nx*ny*nz)/4110.0);
+
+	return PERTURBATE(cost);
+}
+
+/* 
+ *
+ *	Models for CUDA
+ *
+ */
+
+
+/* CUDA cost model used by model_11: n^3/1853.7806, n being the nx of descr[0]. */
+double task_11_cost_cuda(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/1853.7806);
+
+	return PERTURBATE(cost);
+}
+
+/* CUDA cost model used by model_12: n^3/42838.5718, n being the nx of descr[0]. */
+double task_12_cost_cuda(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/42838.5718);
+
+	return PERTURBATE(cost);
+}
+
+
+/* CUDA cost model used by model_21: n^3/49208.667, n being the nx of descr[0]. */
+double task_21_cost_cuda(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/49208.667);
+
+	return PERTURBATE(cost);
+}
+
+
+
+/*
+ * CUDA cost model used by model_22: nx*ny*nz/57523.560, with nx and ny taken
+ * from descr[2] and nz being the ny of descr[0].
+ */
+double task_22_cost_cuda(starpu_buffer_descr *descr)
+{
+	/* held in doubles: the product of three uint32_t wraps above 2^32 */
+	double nx = starpu_get_blas_nx(descr[2].state);
+	double ny = starpu_get_blas_ny(descr[2].state);
+	double nz = starpu_get_blas_ny(descr[0].state);
+
+	double cost = ((nx*ny*nz)/57523.560);
+
+	return PERTURBATE(cost);
+}
+
+/* 
+ *
+ *	Models for CPUs
+ *
+ */
+
+/* CPU cost model used by model_11: n^3/537.5, n being the nx of descr[0]. */
+double task_11_cost_core(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/537.5);
+
+	return PERTURBATE(cost);
+}
+
+/* CPU cost model used by model_12: n^3/6668.224, n being the nx of descr[0]. */
+double task_12_cost_core(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/6668.224);
+
+	return PERTURBATE(cost);
+}
+
+
+/* CPU cost model used by model_21: n^3/6793.8423, n being the nx of descr[0]. */
+double task_21_cost_core(starpu_buffer_descr *descr)
+{
+	/* held in a double: n*n*n wraps in uint32_t once n reaches 1626 */
+	double n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = ((n*n*n)/6793.8423);
+
+	return PERTURBATE(cost);
+}
+
+
+
+/*
+ * CPU cost model used by model_22: nx*ny*nz/4203.0175, with nx and ny taken
+ * from descr[2] and nz being the ny of descr[0].
+ */
+double task_22_cost_core(starpu_buffer_descr *descr)
+{
+	/* held in doubles: the product of three uint32_t wraps above 2^32 */
+	double nx = starpu_get_blas_nx(descr[2].state);
+	double ny = starpu_get_blas_ny(descr[2].state);
+	double nz = starpu_get_blas_ny(descr[0].state);
+
+	double cost = ((nx*ny*nz)/4203.0175);
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Performance model built from the task_11 cost functions: a generic one
+ * plus one per architecture (CPU core and CUDA). The symbol depends on the
+ * BLAS selected at build time (ATLAS, GOTO or neither).
+ */
+struct starpu_perfmodel_t model_11 = {
+	.cost_model = task_11_cost,
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = task_11_cost_core },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = task_11_cost_cuda }
+	},
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = "lu_model_11_atlas"
+#elif defined(GOTO)
+	.symbol = "lu_model_11_goto"
+#else
+	.symbol = "lu_model_11"
+#endif
+};
+
+/*
+ * Performance model built from the task_12 cost functions: a generic one
+ * plus one per architecture (CPU core and CUDA). The symbol depends on the
+ * BLAS selected at build time (ATLAS, GOTO or neither).
+ */
+struct starpu_perfmodel_t model_12 = {
+	.cost_model = task_12_cost,
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = task_12_cost_core },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = task_12_cost_cuda }
+	},
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = "lu_model_12_atlas"
+#elif defined(GOTO)
+	.symbol = "lu_model_12_goto"
+#else
+	.symbol = "lu_model_12"
+#endif
+};
+
+/*
+ * Performance model built from the task_21 cost functions: a generic one
+ * plus one per architecture (CPU core and CUDA). The symbol depends on the
+ * BLAS selected at build time (ATLAS, GOTO or neither).
+ */
+struct starpu_perfmodel_t model_21 = {
+	.cost_model = task_21_cost,
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = task_21_cost_core },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = task_21_cost_cuda }
+	},
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = "lu_model_21_atlas"
+#elif defined(GOTO)
+	.symbol = "lu_model_21_goto"
+#else
+	.symbol = "lu_model_21"
+#endif
+};
+
+/*
+ * Performance model built from the task_22 cost functions: a generic one
+ * plus one per architecture (CPU core and CUDA). The symbol depends on the
+ * BLAS selected at build time (ATLAS, GOTO or neither).
+ */
+struct starpu_perfmodel_t model_22 = {
+	.cost_model = task_22_cost,
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = task_22_cost_core },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = task_22_cost_cuda }
+	},
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = "lu_model_22_atlas"
+#elif defined(GOTO)
+	.symbol = "lu_model_22_goto"
+#else
+	.symbol = "lu_model_22"
+#endif
+};

+ 22 - 0
examples/heat/lu_kernels_model.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __LU_KERNELS_MODEL_H__
+#define __LU_KERNELS_MODEL_H__
+
+#include <starpu.h>
+
+#endif // __LU_KERNELS_MODEL_H__

+ 39 - 0
examples/incrementer/Makefile.in

@@ -0,0 +1,39 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+# "$$" leaves a literal "$" in the variable, so pkg-config is run by the
+# shell each time a recipe using LIBS/CFLAGS is executed.
+LIBS+=$$(pkg-config --libs libstarpu)
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+ifeq (@USE_CUDA@,yes)
+        EXTRADEP += ../cuda/incrementer_cuda.cubin
+endif
+
+# "all" and "clean" name actions, not files: keep them working even if a
+# file with that name shows up in the directory.
+.PHONY: all clean
+
+all: $(EXTRADEP) incrementer
+
+# Use $(MAKE) rather than a literal "make" so that the same make program is
+# used and command-line flags (-j, -n, -k, ...) reach the sub-make.
+../cuda/incrementer_cuda.cubin:
+	$(MAKE) -C ../cuda incrementer_cuda.cubin
+
+incrementer.o: incrementer.c
+	$(CC) $(CFLAGS) incrementer.c -c -o incrementer.o
+
+incrementer: $(STARPU) incrementer.o
+	$(CC) incrementer.o -o incrementer $(LIBS)
+
+clean:
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f incrementer

+ 178 - 0
examples/incrementer/incrementer.c

@@ -0,0 +1,178 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+
+/* for USE_CUDA */
+#include <starpu_config.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#include <cublas.h>
+#endif
+
+#include <starpu.h>
+
+#define NITER	50000
+
+/* StarPU handles bound to my_lovely_float and unity by init_data() */
+starpu_data_handle my_float_state;
+starpu_data_handle unity_state;
+
+/* posted by callback_func() once NITER tasks have completed */
+sem_t sem;
+
+/* byte size of a 4-float vector; its address is passed as codelet argument */
+unsigned size __attribute__ ((aligned (16))) = 4*sizeof(float);
+
+/* vector updated by the tasks, and the increment vector read by the CUBLAS codelet */
+float my_lovely_float[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f, 1664.0f}; 
+float unity[4] __attribute__ ((aligned (16))) = { 1.0f, 0.0f, 1.0f, 0.0f };
+/* loop index used by main() when submitting the tasks */
+unsigned i;
+
+/*
+ * Task completion callback: atomically bumps the counter pointed to by
+ * argcb and wakes up the main thread once it has reached NITER.
+ */
+void callback_func(void *argcb)
+{
+	unsigned *completed = (unsigned *)argcb;
+	unsigned done = STARPU_ATOMIC_ADD(completed, 1);
+
+	/* not the last task yet: nothing more to do */
+	if (done != NITER)
+		return;
+
+	sem_post(&sem);
+}
+
+/*
+ * CPU implementation of the codelet: adds 1 to the first two elements of
+ * the vector held in buffer 0. The codelet argument is not used.
+ */
+void core_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) void *_args)
+{
+	float *vec = (float *)buffers[0].vector.ptr;
+	unsigned k;
+
+	for (k = 0; k < 2; k++)
+		vec[k] += 1.0f;
+}
+
+#ifdef USE_CUDA
+/*
+ * CUBLAS implementation of the codelet: a 3-element SAXPY with alpha = 1
+ * that adds the vector of buffer 1 to the vector of buffer 0.
+ * The codelet argument is not used.
+ */
+void cublas_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) void *_args)
+{
+	float *val = (float *)buffers[0].vector.ptr;
+	float *dunity = (float *)buffers[1].vector.ptr;
+
+	cublasSaxpy(3, 1.0f, dunity, 1, val, 1);
+}
+#endif
+
+#ifdef USE_CUDA
+/* CUDA module/function descriptors and the codelet built on top of them */
+static struct starpu_cuda_module_s cuda_module;
+static struct starpu_cuda_function_s cuda_function;
+
+static starpu_cuda_codelet_t cuda_codelet;
+
+/*
+ * Load the "cuda_incrementer" function from the incrementer_cuda.cubin
+ * module found under STARPUDIR and describe how to launch it: a single
+ * 1x1 block in a 1x1 grid, with 1024 bytes of shared memory.
+ */
+void initialize_cuda(void)
+{
+	char module_path[1024];
+	char *function_symbol = "cuda_incrementer";
+
+	/* bounded formatting: STARPUDIR is a build-time path of arbitrary
+	 * length, and sprintf would silently overflow module_path */
+	int len = snprintf(module_path, sizeof(module_path),
+		"%s/examples/cuda/incrementer_cuda.cubin", STARPUDIR);
+	STARPU_ASSERT(len >= 0 && (size_t)len < sizeof(module_path));
+
+	starpu_init_cuda_module(&cuda_module, module_path);
+	starpu_init_cuda_function(&cuda_function, &cuda_module, function_symbol);
+
+	cuda_codelet.func = &cuda_function;
+
+	cuda_codelet.gridx = 1;
+	cuda_codelet.gridy = 1;
+
+	cuda_codelet.blockx = 1;
+	cuda_codelet.blocky = 1;
+
+	cuda_codelet.shmemsize = 1024;
+}
+#endif
+
+/*
+ * Register the two 4-float vectors with StarPU, both on node 0.
+ */
+void init_data(void)
+{
+	const uintptr_t float_addr = (uintptr_t)&my_lovely_float;
+	const uintptr_t unity_addr = (uintptr_t)&unity;
+
+	starpu_monitor_vector_data(&my_float_state, 0 /* home node */,
+					float_addr, 4, sizeof(float));
+	starpu_monitor_vector_data(&unity_state, 0 /* home node */,
+					unity_addr, 4, sizeof(float));
+}
+
+/*
+ * Submit NITER asynchronous "increment" tasks on the same vector, wait for
+ * all of them, then check the result.
+ *
+ * The CPU codelet adds 1 to elements 0 and 1 while the CUBLAS codelet adds
+ * unity[] (1, 0, 1), so whatever mix of workers ran the tasks, element 0
+ * must equal the sum of elements 1 and 2.
+ *
+ * Returns 0 on success, 1 if the check fails.
+ */
+int main(__attribute__ ((unused)) int argc, __attribute__ ((unused)) char **argv)
+{
+	unsigned counter = 0;
+	int ret = 0;
+
+	starpu_init();
+	fprintf(stderr, "StarPU initialized ...\n");
+
+	sem_init(&sem, 0, 0);
+
+	init_data();
+
+#ifdef USE_CUDA
+	initialize_cuda();
+#endif
+
+	/* the codelet lives until main returns, i.e. after every task is done */
+	starpu_codelet cl =
+	{
+		.core_func = core_codelet,
+		.where = CORE|CUDA|GORDON,
+#ifdef USE_CUDA
+		.cuda_func = &cuda_codelet,
+#endif
+#ifdef USE_GORDON
+		.gordon_func = SPU_FUNC_ADD,
+#endif
+		.nbuffers = 2
+	};
+
+	for (i = 0; i < NITER; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		task->cl = &cl;
+
+		task->callback_func = callback_func;
+		task->callback_arg = &counter;
+
+		task->cl_arg = &size;
+		task->cl_arg_size = sizeof(unsigned);
+
+		task->buffers[0].state = my_float_state;
+		task->buffers[0].mode = RW;
+		task->buffers[1].state = unity_state;
+		task->buffers[1].mode = R;
+
+		task->use_tag = 0;
+		task->synchronous = 0;
+
+		starpu_submit_task(task);
+	}
+
+	/* callback_func() posts the semaphore when the last task completes */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	starpu_sync_data_with_mem(my_float_state);
+
+	printf("array -> %f, %f, %f\n", my_lovely_float[0],
+			my_lovely_float[1], my_lovely_float[2]);
+
+	if (my_lovely_float[0] != my_lovely_float[1] + my_lovely_float[2])
+		ret = 1;
+
+	/* shut the runtime down on the failure path too */
+	starpu_shutdown();
+
+	return ret;
+}

+ 46 - 0
examples/mult/Makefile.in

@@ -0,0 +1,46 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+# "$$" leaves a literal "$" in the variable, so pkg-config is run by the
+# shell each time a recipe using LIBS/CFLAGS is executed.
+LIBS+=$$(pkg-config --libs libstarpu)
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+# BLAS wrappers and performance models shared by the three programs
+BLASOBJ=../common/blas.o ../common/blas_model.o
+
+# "all" and "clean" name actions, not files: keep them working even if a
+# file with that name shows up in the directory.
+.PHONY: all clean
+
+all: dw_mult dw_mult_no_stride dw_mult_no_stride_no_tag
+
+dw_mult.o: dw_mult.c dw_mult.h
+	$(CC) $(CFLAGS) dw_mult.c -c -o dw_mult.o
+
+dw_mult_no_stride.o: dw_mult_no_stride.c dw_mult.h
+	$(CC) $(CFLAGS) dw_mult_no_stride.c -c -o dw_mult_no_stride.o
+
+dw_mult_no_stride_no_tag.o: dw_mult_no_stride_no_tag.c dw_mult.h
+	$(CC) $(CFLAGS) dw_mult_no_stride_no_tag.c -c -o dw_mult_no_stride_no_tag.o
+
+dw_mult: $(BLASOBJ) dw_mult.o
+	$(CC) $(BLASOBJ) dw_mult.o -o dw_mult $(LDFLAGS) $(LIBS)
+
+dw_mult_no_stride: $(BLASOBJ) dw_mult_no_stride.o
+	$(CC) $(BLASOBJ) dw_mult_no_stride.o -o dw_mult_no_stride $(LDFLAGS) $(LIBS)
+
+dw_mult_no_stride_no_tag: $(BLASOBJ) dw_mult_no_stride_no_tag.o
+	$(CC) $(BLASOBJ) dw_mult_no_stride_no_tag.o -o dw_mult_no_stride_no_tag $(LDFLAGS) $(LIBS)
+
+clean:
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f dw_mult dw_mult_no_stride dw_mult_no_stride_no_tag

+ 332 - 0
examples/mult/dw_mult.c

@@ -0,0 +1,332 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_mult.h"
+
+/* the three matrices (allocated by init_problem_data) and the StarPU
+ * handles registered for them by partition_mult_data */
+float *A, *B, *C;
+starpu_data_handle A_state, B_state, C_state;
+
+/*
+ * That program should compute C = A * B 
+ * 
+ *   A of size (z,y)
+ *   B of size (x,z)
+ *   C of size (x,y)
+
+              |---------------|
+            z |       B       |
+              |---------------|
+       z              x
+     |----|   |---------------|
+     |    |   |               |
+     |    |   |               |
+     | A  | y |       C       |
+     |    |   |               |
+     |    |   |               |
+     |----|   |---------------|
+
+ */
+
+/*
+ * Runs once the last task has completed: unpartitions and deletes the C
+ * handle, stops the timer, prints timing/GFlop/GB statistics, optionally
+ * checks the result (CHECK_OUTPUT) and finally wakes up the main thread.
+ */
+void terminate(void)
+{
+	/* same divisor as the float literal used for every "giga" figure */
+	const double giga = 1000000000.0f;
+	double elapsed_us;
+	uint64_t flop_total;
+	uint64_t ls_total;
+
+	fprintf(stderr, "unpartition !!\n");
+	starpu_unpartition_data(C_state, 0);
+
+	starpu_delete_data(C_state);
+
+	gettimeofday(&end, NULL);
+
+	/* elapsed time in microseconds since partition_mult_data() */
+	elapsed_us = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	flop_total = BLAS3_FLOP(ydim, xdim, zdim);
+	ls_total = ls_cublas + ls_atlas;
+
+	fprintf(stderr, "Computation took (ms):\n");
+	printf("%2.2f\n", elapsed_us/1000);
+	fprintf(stderr, "	GFlop : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n",
+		(double)flop_total/giga, (double)flop_cublas/giga, (double)flop_atlas/giga);
+	/* flop per microsecond, divided by 1000, gives GFlop/s */
+	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)flop_total / (double)elapsed_us/1000);
+	fprintf(stderr, "	GB : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n",
+		(double)ls_total/giga, (double)ls_cublas/giga, (double)ls_atlas/giga);
+	fprintf(stderr, "	GB/s : %2.2f\n", (double)ls_total / (double)elapsed_us/1000);
+
+#ifdef CHECK_OUTPUT
+	/* check results: compute C = C - AB, which should be all zeros */
+	SGEMM("N", "N", ydim, xdim, zdim, -1.0f, A, ydim, B, zdim, 1.0f, C, ydim);
+
+	/* sum of absolute values of what is left in C */
+	float err = SASUM(xdim*ydim, C, 1);
+
+	if (err < xdim*ydim*0.001)
+		fprintf(stderr, "Results are OK\n");
+	else
+		fprintf(stderr, "There were errors ... err = %f\n", err);
+#endif // CHECK_OUTPUT
+
+	sem_post(&sem);
+}
+
+/*
+ * Task completion callback. arg points to the counter of remaining tasks;
+ * the task that brings it down to zero triggers terminate().
+ */
+void callback_func(void *arg)
+{
+	int remaining = STARPU_ATOMIC_ADD((int *)arg, -1);
+
+	/* other tasks are still pending */
+	if (remaining != 0)
+		return;
+
+	/* we are done */
+	fprintf(stderr, "done ...\n");
+	terminate();
+}
+
+
+/*
+ * Prologue shared by the codelet implementations. It expects a variable
+ * named "descr" in scope and declares:
+ *   subA, subB, subC : pointers of buffers 0, 1 and 2
+ *   nxC, nyC         : nx and ny of buffer 2
+ *   nyA              : ny of buffer 0
+ *   ldA, ldB, ldC    : ld of buffers 0, 1 and 2
+ */
+#define COMMON_CODE			\
+	uint32_t nxC, nyC, nyA;		\
+	uint32_t ldA, ldB, ldC;		\
+					\
+	float *subA;			\
+	float *subB;			\
+	float *subC;			\
+					\
+	subA = (float *)descr[0].blas.ptr;	\
+	subB = (float *)descr[1].blas.ptr;	\
+	subC = (float *)descr[2].blas.ptr;	\
+					\
+	nxC = descr[2].blas.nx;		\
+	nyC = descr[2].blas.ny;		\
+	nyA = descr[0].blas.ny;		\
+					\
+	ldA = descr[0].blas.ld;		\
+	ldB = descr[1].blas.ld;		\
+	ldC = descr[2].blas.ld;
+
+
+
+#ifdef USE_CUDA
+/*
+ * CUBLAS implementation: subC = subA * subB (beta is 0, so the previous
+ * content of subC is discarded). Asserts if CUBLAS reports an error, then
+ * accounts for the work in flop_cublas and ls_cublas.
+ * NOTE(review): the counters are updated without atomics -- confirm that
+ * a single worker at a time runs this function.
+ */
+void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	COMMON_CODE
+
+	cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
+					     0.0f, subC, ldC);
+	cublasStatus st;
+	st = cublasGetError();
+	if (st != CUBLAS_STATUS_SUCCESS)
+		STARPU_ASSERT(0);
+
+	uint64_t flopcnt = BLAS3_FLOP(nyC, nxC, nyA);
+
+	flop_cublas += flopcnt;
+	ls_cublas += BLAS3_LS(nyC, nxC, nyA);
+}
+#endif
+
+/*
+ * CPU implementation: subC = subA * subB through SGEMM (beta is 0), then
+ * accounts for the work in flop_atlas and ls_atlas.
+ * NOTE(review): the counters are updated without atomics although several
+ * core workers may presumably run concurrently -- confirm.
+ */
+void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	COMMON_CODE
+
+	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 0.0f, subC, ldC);
+
+	flop_atlas += BLAS3_FLOP(nxC, nyC, nyA);
+	ls_atlas += BLAS3_LS(nxC, nyC, nyA);
+}
+
+/*
+ * Allocate the A (ydim x zdim), B (zdim x xdim) and C (ydim x xdim)
+ * matrices, stored column after column, and fill them: A and B get either
+ * deterministic values (-no-random) or drand48() values, C is zeroed.
+ */
+static void init_problem_data(void)
+{
+	unsigned i,j;
+
+	/* byte sizes are computed in size_t so that large matrices do not
+	 * wrap around in unsigned arithmetic */
+	const size_t sizeA = (size_t)zdim*ydim*sizeof(float);
+	const size_t sizeB = (size_t)xdim*zdim*sizeof(float);
+	const size_t sizeC = (size_t)xdim*ydim*sizeof(float);
+
+#ifdef USE_CUDA
+	if (pin) {
+		starpu_malloc_pinned_if_possible(&A, sizeA);
+		starpu_malloc_pinned_if_possible(&B, sizeB);
+		starpu_malloc_pinned_if_possible(&C, sizeC);
+	} else
+#endif
+	{
+		/* posix_memalign reports failure through its return value and
+		 * leaves the pointer unspecified: do not go on in that case */
+		if (posix_memalign((void **)&A, 4096, sizeA) != 0
+		 || posix_memalign((void **)&B, 4096, sizeB) != 0
+		 || posix_memalign((void **)&C, 4096, sizeC) != 0)
+		{
+			fprintf(stderr, "could not allocate the matrices\n");
+			STARPU_ASSERT(0);
+		}
+	}
+
+	/* fill the A and B matrices */
+	if (norandom) {
+		for (j=0; j < ydim; j++) {
+			for (i=0; i < zdim; i++) {
+				A[j+(size_t)i*ydim] = (float)(i);
+			}
+		}
+
+		for (j=0; j < zdim; j++) {
+			for (i=0; i < xdim; i++) {
+				B[j+(size_t)i*zdim] = (float)(j);
+			}
+		}
+	}
+	else {
+#ifdef NORANDOM
+		/* random input is not expected in a NORANDOM build */
+		srand(2008);
+		STARPU_ASSERT(0);
+#endif
+		for (j=0; j < ydim; j++) {
+			for (i=0; i < zdim; i++) {
+				A[j+(size_t)i*ydim] = (float)(drand48());
+			}
+		}
+
+		for (j=0; j < zdim; j++) {
+			for (i=0; i < xdim; i++) {
+				B[j+(size_t)i*zdim] = (float)(drand48());
+			}
+		}
+	}
+
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < xdim; i++) {
+			C[j+(size_t)i*ydim] = (float)(0);
+		}
+	}
+
+	display_memory_consumption();
+}
+
+/*
+ * Start the timer, register the three matrices with StarPU and partition
+ * them: B with the vertical block filter (nslicesx parts), A with the
+ * block filter (nslicesy parts), and C with both filters.
+ * Also fills conf with the dimensions of one block product.
+ */
+static void partition_mult_data(void)
+{
+	gettimeofday(&start, NULL);
+
+	/* NOTE(review): the three sizes are presumably (ld, nx, ny) -- confirm
+	 * against the starpu_monitor_blas_data prototype */
+	starpu_monitor_blas_data(&A_state, 0, (uintptr_t)A, 
+		ydim, ydim, zdim, sizeof(float));
+	starpu_monitor_blas_data(&B_state, 0, (uintptr_t)B, 
+		zdim, zdim, xdim, sizeof(float));
+	starpu_monitor_blas_data(&C_state, 0, (uintptr_t)C, 
+		ydim, ydim, xdim, sizeof(float));
+
+	conf.k = zdim;
+	conf.m = ydim/nslicesy;
+	conf.n = xdim/nslicesx;
+
+	/* only filter_func and filter_arg are set; any other field of the
+	 * filters is left uninitialized */
+	starpu_filter f;
+	f.filter_func = starpu_vertical_block_filter_func;
+	f.filter_arg = nslicesx;
+		
+	starpu_filter f2;
+	f2.filter_func = starpu_block_filter_func;
+	f2.filter_arg = nslicesy;
+		
+	starpu_partition_data(B_state, &f);
+	starpu_partition_data(A_state, &f2);
+
+	starpu_map_filters(C_state, 2, &f, &f2);
+}
+
+/*
+ * Submit one task per (taskx, tasky) block of C, i.e. nslicesx * nslicesy
+ * tasks. Each task reads one slice of A and one slice of B and updates the
+ * matching block of C; callback_func() counts the completions through
+ * taskcounter.
+ */
+static void launch_codelets(void)
+{
+#ifdef USE_FXT
+	fxt_register_thread(0);
+#endif
+	/* partition the work into slices */
+	unsigned taskx, tasky;
+
+	taskcounter = nslicesx * nslicesy;
+
+	srand(time(NULL));
+
+	/* Static storage is required here: every task keeps a pointer to the
+	 * codelet and tasks are still running after this function has
+	 * returned (main only waits on the semaphore afterwards), so an
+	 * automatic variable would leave them with a dangling pointer. */
+	static starpu_codelet cl = {
+		.where = CORE|CUBLAS|GORDON,
+		.core_func = core_mult,
+#ifdef USE_CUDA
+		.cublas_func = cublas_mult,
+#endif
+#ifdef USE_GORDON
+		.gordon_func = SPU_FUNC_SGEMM,
+#endif
+		.nbuffers = 3
+	};
+
+	/* the model is the same for all tasks: select it once */
+	if (use_common_model)
+	{
+		cl.model = &sgemm_model_common;
+	}
+	else
+	{
+		cl.model = &sgemm_model;
+	}
+
+	for (taskx = 0; taskx < nslicesx; taskx++)
+	{
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
+			/* A B[task] = C[task] */
+			struct starpu_task *task = starpu_task_create();
+
+			task->cl = &cl;
+			task->cl_arg = &conf;
+			task->cl_arg_size = sizeof(struct block_conf);
+
+			task->callback_func = callback_func;
+			task->callback_arg = &taskcounter;
+
+			/* unique tag: taskx in the high 32 bits, tasky in the low ones */
+			starpu_tag_t tag =
+				((((unsigned long long)(taskx))<<32)
+				| (unsigned long long)(tasky));
+
+			task->use_tag = 1;
+			task->tag_id = tag;
+
+			task->buffers[0].state = get_sub_data(A_state, 1, tasky);
+			task->buffers[0].mode = R;
+			task->buffers[1].state = get_sub_data(B_state, 1, taskx);
+			task->buffers[1].mode = R;
+			task->buffers[2].state =
+				get_sub_data(C_state, 2, taskx, tasky);
+			task->buffers[2].mode = RW;
+
+			starpu_submit_task(task);
+
+		}
+	}
+}
+
+/*
+ * Entry point: parse the command line, start StarPU, build and partition
+ * the matrices, submit the tasks and wait until terminate() posts the
+ * semaphore before shutting the runtime down.
+ * NOTE(review): argc/argv are tagged "unused" although they are forwarded
+ * to parse_args(); the attribute is harmless but misleading.
+ */
+int main(__attribute__ ((unused)) int argc, 
+	 __attribute__ ((unused)) char **argv)
+{
+
+	parse_args(argc, argv);
+
+	/* start the runtime */
+	starpu_init();
+
+	sem_init(&sem, 0, 0U);
+
+	init_problem_data();
+
+	partition_mult_data();
+
+	launch_codelets();
+
+	/* posted by terminate() once the last task has completed */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 169 - 0
examples/mult/dw_mult.h

@@ -0,0 +1,169 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MULT_H__
+#define __MULT_H__
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <common/blas.h>
+#include <common/blas_model.h>
+
+#include <starpu_config.h>
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#include <cublas.h>
+#endif
+
+#define MAXSLICESX	32
+#define MAXSLICESY	32
+#define MAXSLICESZ	32
+
+/*
+ * Number of floating point operations of an (n1 x n3) by (n3 x n2) matrix
+ * product. Each argument is parenthesized before being widened so that
+ * expressions such as BLAS3_FLOP(a+b, c, d) are converted as a whole.
+ */
+#define BLAS3_FLOP(n1,n2,n3)	\
+	(2*((uint64_t)(n1))*((uint64_t)(n2))*((uint64_t)(n3)))
+
+/*
+ * Number of bytes accounted as loaded/stored for the same product. The
+ * products are evaluated in 64 bits: with 32-bit operands they would wrap
+ * around before being scaled by sizeof(float).
+ */
+#define BLAS3_LS(n1,n2,n3)    \
+	((2*((uint64_t)(n1))*((uint64_t)(n3))	\
+		+ ((uint64_t)(n1))*((uint64_t)(n2))	\
+		+ ((uint64_t)(n2))*((uint64_t)(n3)))*sizeof(float))
+
+/*
+ * Dimensions of one block product, passed to the tasks as codelet
+ * argument (cl_arg).
+ */
+struct block_conf {
+	uint32_t m;
+	uint32_t n;
+	uint32_t k;
+	uint32_t pad;	/* never written in this header; presumably padding -- confirm */
+};
+
+/* default number of iterations (-iter) */
+#define NITER	100
+
+/* problem parameters; all of them can be overridden by parse_args() */
+unsigned niter = NITER;
+unsigned nslicesx = 4;
+unsigned nslicesy = 4;
+unsigned nslicesz = 4;
+unsigned xdim = 256;
+unsigned ydim = 256;
+unsigned zdim = 64;
+unsigned norandom = 0;
+unsigned pin = 0;
+unsigned use_common_model = 0;
+
+/* to compute MFlop/s */
+uint64_t flop_cublas = 0;
+uint64_t flop_atlas = 0;
+
+/* to compute MB/s (load/store) */
+uint64_t ls_cublas = 0;
+uint64_t ls_atlas = 0;
+
+
+/* start/end of the measured section, and the semaphore used to wake up
+ * main() once the computation is over */
+struct timeval start;
+struct timeval end;
+sem_t sem;
+
+/* NOTE(review): these variables are defined (not just declared) in a
+ * header, which only works while each program includes it from a single
+ * source file */
+static int taskcounter __attribute__ ((unused));
+static struct block_conf conf __attribute__ ((aligned (128)));
+
+/* dimensions of one block; the slice counts must not be zero */
+#define BLOCKSIZEX	(xdim / nslicesx)
+#define BLOCKSIZEY	(ydim / nslicesy)
+#define BLOCKSIZEZ	(zdim / nslicesz)
+
+/*
+ * Return the unsigned value that follows option argv[*pos] and advance
+ * *pos onto it. Exits with an error message when the value is missing,
+ * is not a decimal number, or is negative.
+ */
+static unsigned parse_unsigned_value(int argc, char **argv, int *pos)
+{
+	const char *option = argv[*pos];
+	char *endptr;
+	long value;
+
+	if (*pos + 1 >= argc) {
+		fprintf(stderr, "option %s expects a value\n", option);
+		exit(1);
+	}
+
+	(*pos)++;
+	value = strtol(argv[*pos], &endptr, 10);
+
+	/* reject empty strings, trailing garbage and negative numbers */
+	if (endptr == argv[*pos] || *endptr != '\0' || value < 0) {
+		fprintf(stderr, "invalid value '%s' for option %s\n",
+			argv[*pos], option);
+		exit(1);
+	}
+
+	return (unsigned)value;
+}
+
+/*
+ * Parse the command line and update the problem parameters:
+ *   -nblocks n                  number of slices in all three dimensions
+ *   -nblocksx/-nblocksy/-nblocksz n   number of slices in one dimension
+ *   -x/-y/-z n                  matrix dimensions
+ *   -iter n                     number of iterations
+ *   -no-random, -pin, -common-model   boolean switches
+ * Unknown arguments are ignored. The slice counts must lie within
+ * [1, MAXSLICES*]: zero would later divide by zero in the block size
+ * computations.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		const char *option = argv[i];
+
+		if (strcmp(option, "-nblocks") == 0) {
+			nslicesx = parse_unsigned_value(argc, argv, &i);
+			nslicesy = nslicesx;
+			nslicesz = nslicesx;
+		}
+		else if (strcmp(option, "-nblocksx") == 0) {
+			nslicesx = parse_unsigned_value(argc, argv, &i);
+		}
+		else if (strcmp(option, "-nblocksy") == 0) {
+			nslicesy = parse_unsigned_value(argc, argv, &i);
+		}
+		else if (strcmp(option, "-nblocksz") == 0) {
+			nslicesz = parse_unsigned_value(argc, argv, &i);
+		}
+		else if (strcmp(option, "-x") == 0) {
+			xdim = parse_unsigned_value(argc, argv, &i);
+		}
+		else if (strcmp(option, "-y") == 0) {
+			ydim = parse_unsigned_value(argc, argv, &i);
+		}
+		else if (strcmp(option, "-z") == 0) {
+			zdim = parse_unsigned_value(argc, argv, &i);
+		}
+		else if (strcmp(option, "-iter") == 0) {
+			niter = parse_unsigned_value(argc, argv, &i);
+		}
+		else if (strcmp(option, "-no-random") == 0) {
+			norandom = 1;
+		}
+		else if (strcmp(option, "-pin") == 0) {
+			pin = 1;
+		}
+		else if (strcmp(option, "-common-model") == 0) {
+			use_common_model = 1;
+		}
+	}
+
+	assert(nslicesx >= 1 && nslicesx <= MAXSLICESX);
+	assert(nslicesy >= 1 && nslicesy <= MAXSLICESY);
+	assert(nslicesz >= 1 && nslicesz <= MAXSLICESZ);
+}
+
+/*
+ * Print on stderr the memory footprint of the example, in MB: the pointer
+ * and handle grids sized by MAXSLICES*, plus the three float matrices.
+ */
+static void display_memory_consumption(void)
+{
+	/* every term is a size_t: the matrix terms are widened before the
+	 * multiplication so that they cannot wrap around in unsigned */
+	const size_t grids =
+		  MAXSLICESY*MAXSLICESZ*sizeof(float *)
+		+ MAXSLICESZ*MAXSLICESX*sizeof(float *)
+		+ MAXSLICESY*MAXSLICESX*sizeof(float *)
+		+ MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle)
+		+ MAXSLICESZ*MAXSLICESX*sizeof(starpu_data_handle)
+		+ MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle);
+	const size_t matrices =
+		  (size_t)ydim*zdim*sizeof(float)
+		+ (size_t)zdim*xdim*sizeof(float)
+		+ (size_t)ydim*xdim*sizeof(float);
+
+	/* %zu is the conversion matching size_t; %ld is undefined behavior
+	 * wherever long and size_t differ */
+	fprintf(stderr, "Total memory : %zu MB\n",
+		(grids + matrices)/(1024*1024));
+}
+
+
+#endif // __MULT_H__

+ 461 - 0
examples/mult/dw_mult_no_stride.c

@@ -0,0 +1,461 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_mult.h"
+
+/* grids of separately allocated blocks: A[y][z], B[z][x] and C[y][x] */
+float *A[MAXSLICESY][MAXSLICESZ];
+float *B[MAXSLICESZ][MAXSLICESX];
+float *C[MAXSLICESY][MAXSLICESX];
+
+/* one StarPU handle per block */
+starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
+starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
+starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
+
+/* fortran ordering ... */
+/* FULLx(i,j): element (i,j) of the whole matrix, looked up in the block
+ * that holds it (block index by division, offset in the block by modulo) */
+#define FULLA(i,j)	\
+	(A[(i)/BLOCKSIZEY][(j)/BLOCKSIZEZ][(i)%BLOCKSIZEY + ((j)%BLOCKSIZEZ)*BLOCKSIZEY])
+
+#define FULLB(i,j)	\
+	(B[(i)/BLOCKSIZEZ][(j)/BLOCKSIZEX][(i)%BLOCKSIZEZ + ((j)%BLOCKSIZEX)*BLOCKSIZEZ])
+
+#define FULLC(i,j)	\
+	(C[(i)/BLOCKSIZEY][(j)/BLOCKSIZEX][(i)%BLOCKSIZEY + ((j)%BLOCKSIZEX)*BLOCKSIZEY])
+
+/*
+ * Unique tag of the task working on block (x, y) at step z of iteration
+ * iter. The parameters are listed in the order used by every caller in
+ * this file, TAG(z, y, x, iter), and combined as a mixed-radix number
+ * (z < nslicesz, iter < niter, x < nslicesx) so that two distinct tasks
+ * never share a tag, even when nslicesx and nslicesz differ. The cast
+ * makes the whole expression evaluate in starpu_tag_t.
+ */
+#define TAG(z,y,x,iter)	\
+		((z) + nslicesz*((iter) + niter*((x) + nslicesx*(starpu_tag_t)(y))))
+
+static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
+
+/*
+ * That program should compute C = A * B 
+ * 
+ *   A of size (z,y)
+ *   B of size (x,z)
+ *   C of size (x,y)
+
+              |---------------|
+            z |       B       |
+              |---------------|
+       z              x
+     |----|   |---------------|
+     |    |   |               |
+     |    |   |               |
+     | A  | y |       C       |
+     |    |   |               |
+     |    |   |               |
+     |----|   |---------------|
+
+ */
+
+/*
+ * Runs once all the blocks of C have gone through their last iteration:
+ * stops the timer, prints the timing and GFlop statistics and wakes up
+ * the main thread.
+ */
+static void terminate(void)
+{
+	/* same divisor as the float literal used for every "giga" figure */
+	const double giga = 1000000000.0f;
+	double elapsed_us;
+	uint64_t flop_total;
+
+	gettimeofday(&end, NULL);
+
+	/* elapsed time in microseconds since launch_codelets() */
+	elapsed_us = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	flop_total = BLAS3_FLOP(ydim, xdim, zdim)*niter;
+
+	fprintf(stderr, "Computation took (ms):\n");
+	printf("%2.2f\n", elapsed_us/1000);
+	fprintf(stderr, "	GFlop : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n",
+		(double)flop_total/giga, (double)flop_cublas/giga, (double)flop_atlas/giga);
+	/* flop per microsecond, divided by 1000, gives GFlop/s */
+	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)flop_total / (double)elapsed_us/1000);
+
+	sem_post(&sem);
+}
+
+
+/*
+ * Prologue shared by the codelet implementations. It expects a variable
+ * named "descr" in scope and declares:
+ *   subA, subB, subC : pointers of buffers 0, 1 and 2
+ *   nxC, nyC         : nx and ny of buffer 2
+ *   nyA              : ny of buffer 0
+ *   ldA, ldB, ldC    : ld of buffers 0, 1 and 2
+ */
+#define COMMON_CODE			\
+	uint32_t nxC, nyC, nyA;		\
+	uint32_t ldA, ldB, ldC;		\
+					\
+	float *subA;			\
+	float *subB;			\
+	float *subC;			\
+					\
+	subA = (float *)descr[0].blas.ptr;	\
+	subB = (float *)descr[1].blas.ptr;	\
+	subC = (float *)descr[2].blas.ptr;	\
+					\
+	nxC = descr[2].blas.nx;		\
+	nyC = descr[2].blas.ny;		\
+	nyA = descr[0].blas.ny;		\
+					\
+	ldA = descr[0].blas.ld;		\
+	ldB = descr[1].blas.ld;		\
+	ldC = descr[2].blas.ld;
+
+
+
+#ifdef USE_CUDA
+/*
+ * CUBLAS implementation: subC += subA * subB (beta is 1, so the product
+ * is accumulated into subC). Asserts if CUBLAS reports an error, then
+ * accounts for the work in flop_cublas and ls_cublas.
+ */
+static void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	COMMON_CODE
+
+	cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
+					     1.0f, subC, ldC);
+	cublasStatus st;
+	st = cublasGetError();
+	if (st != CUBLAS_STATUS_SUCCESS)
+		STARPU_ASSERT(0);
+
+	uint64_t flopcnt = BLAS3_FLOP(nyC, nxC, nyA);
+
+	flop_cublas += flopcnt;
+	ls_cublas += BLAS3_LS(nyC, nxC, nyA);
+}
+#endif
+
+/*
+ * CPU implementation: subC += subA * subB through SGEMM (beta is 1, so
+ * the product is accumulated into subC), then accounts for the work in
+ * flop_atlas and ls_atlas.
+ */
+static void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	COMMON_CODE
+
+//	fprintf(stderr, "Call SGEMM : nxC %d nyC %d nyA %d subA %p ldA %d subB %p ldB %d subC %p ldC %d\n",
+//				nxC, nyC, nyA, subA, ldA, subB, ldB, subC, ldC);
+	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 1.0f, subC, ldC);
+
+	flop_atlas += BLAS3_FLOP(nxC, nyC, nyA);
+	ls_atlas += BLAS3_LS(nxC, nyC, nyA);
+}
+
+#define MEM_ALIGNMENT	16
+
+/*
+ * Allocate a block of nelems floats aligned on MEM_ALIGNMENT bytes.
+ * Exits with an error message if the block cannot be obtained.
+ */
+static float *allocate_block(size_t nelems)
+{
+	void *block = NULL;
+
+	/* posix_memalign reports failure through its return value and leaves
+	 * the pointer unspecified, so the pointer alone cannot be trusted */
+	if (posix_memalign(&block, MEM_ALIGNMENT, nelems*sizeof(float)) != 0 || !block)
+	{
+		fprintf(stderr, "could not allocate a block of %lu floats\n",
+			(unsigned long)nelems);
+		exit(1);
+	}
+
+	return block;
+}
+
+/*
+ * Allocate and fill the grids of blocks of A, B and C, register each block
+ * with StarPU and fill conf with the dimensions of one block product.
+ * A and B get either deterministic values (-no-random) or drand48()
+ * values; every element of block (y, x) of C is set to x + y*nslicesx + 1.
+ */
+static void init_problem_data(void)
+{
+	unsigned i,j;
+
+	/* debug ... : clear the whole grids. sizeof on the arrays keeps the
+	 * sizes right whatever the MAXSLICES* values are. */
+	memset(A, 0, sizeof(A));
+	memset(B, 0, sizeof(B));
+	memset(C, 0, sizeof(C));
+	memset(A_state, 0, sizeof(A_state));
+	memset(B_state, 0, sizeof(B_state));
+	memset(C_state, 0, sizeof(C_state));
+
+	/* Allocate grids of buffer */
+	/* TODO pin ... */
+	unsigned z, y, x;
+
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (z = 0; z < nslicesz; z++)
+		{
+			A[y][z] = allocate_block((size_t)BLOCKSIZEZ*BLOCKSIZEY);
+		}
+	}
+
+	for (z = 0; z < nslicesz; z++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+			B[z][x] = allocate_block((size_t)BLOCKSIZEX*BLOCKSIZEZ);
+		}
+	}
+
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+			C[y][x] = allocate_block((size_t)BLOCKSIZEX*BLOCKSIZEY);
+		}
+	}
+
+	/* fill the A and B matrices */
+	unsigned blockx, blocky, blockz;
+
+	if (norandom) {
+		for (blocky = 0; blocky < nslicesy; blocky++)
+			for (blockz = 0; blockz < nslicesz; blockz++)
+				for (j = 0; j < BLOCKSIZEY; j++)
+					for (i = 0; i < BLOCKSIZEZ; i++)
+					{
+						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(1 + blockz + blocky*nslicesz);
+					}
+
+		for (blockz = 0; blockz < nslicesz; blockz++)
+			for (blockx = 0; blockx < nslicesx; blockx++)
+				for (j = 0; j < BLOCKSIZEZ; j++)
+					for (i = 0; i < BLOCKSIZEX; i++)
+					{
+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(1 + blockx + blockz*nslicesx);
+					}
+	}
+	else {
+		for (blocky = 0; blocky < nslicesy; blocky++)
+			for (blockz = 0; blockz < nslicesz; blockz++)
+				for (j = 0; j < BLOCKSIZEY; j++)
+					for (i = 0; i < BLOCKSIZEZ; i++)
+					{
+						A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(drand48());
+					}
+
+		for (blockz = 0; blockz < nslicesz; blockz++)
+			for (blockx = 0; blockx < nslicesx; blockx++)
+				for (j = 0; j < BLOCKSIZEZ; j++)
+					for (i = 0; i < BLOCKSIZEX; i++)
+					{
+						B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(drand48());
+					}
+
+	}
+
+	for (blocky = 0; blocky < nslicesy; blocky++)
+		for (blockx = 0; blockx < nslicesx; blockx++)
+			for (j = 0; j < BLOCKSIZEY; j++)
+				for (i = 0; i < BLOCKSIZEX; i++)
+				{
+					C[blocky][blockx][i*BLOCKSIZEY + j] = (float)(blockx + blocky*nslicesx + 1);
+				}
+
+
+	/* declare the StarPU data to monitor */
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (z = 0; z < nslicesz; z++)
+		{
+			starpu_monitor_blas_data(&A_state[y][z], 0, (uintptr_t)A[y][z],
+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(float));
+		}
+	}
+
+	for (z = 0; z < nslicesz; z++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+			starpu_monitor_blas_data(&B_state[z][x], 0, (uintptr_t)B[z][x],
+				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(float));
+		}
+	}
+
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+			starpu_monitor_blas_data(&C_state[y][x], 0, (uintptr_t)C[y][x],
+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(float));
+		}
+	}
+
+	conf.k = BLOCKSIZEZ;
+	conf.m = BLOCKSIZEY;
+	conf.n = BLOCKSIZEX;
+
+	display_memory_consumption();
+}
+
+/*
+ * Release what is left after the computation: the tag of the last task of
+ * the last iteration of every C block is removed.
+ * NOTE(review): every free() is commented out, so the blocks of A, B and C
+ * are never released and the first two loop nests do nothing. Presumably
+ * the blocks cannot be freed while they are still registered with StarPU
+ * -- confirm before re-enabling the calls.
+ */
+static void cleanup_problem(void)
+{
+	unsigned z, y, x;
+
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (z = 0; z < nslicesz; z++)
+		{
+	//		free(A[y][z]);
+		}
+	}
+
+	for (z = 0; z < nslicesz; z++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+	//		free(B[z][x]);
+		}
+	}
+
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+	//		free(C[y][x]);
+			/* callback_func_2() leaves this last tag in place */
+			starpu_tag_remove(TAG(nslicesz - 1, y, x, niter - 1));
+		}
+	}
+
+	
+	
+}
+
+/* number of (x, y) blocks of C that have not completed their last
+ * iteration yet; set by launch_codelets() */
+int xycounter;
+
+/* argument of callback_func_2(): identifies the block and the iteration
+ * that just completed. It is allocated by submit_new_iter() and freed by
+ * the callback. */
+struct cb2_s {
+	unsigned blockx;
+	unsigned blocky;
+	unsigned iter;
+	int *xycounter;
+};
+
+/* codelet shared by all the tasks: three buffers (A, B and C blocks), with
+ * a CPU implementation and, when enabled at build time, CUBLAS and Gordon
+ * ones */
+static starpu_codelet cl = {
+	.core_func = core_mult,
+#ifdef USE_CUDA
+	.cublas_func = cublas_mult,
+#endif
+#ifdef USE_GORDON
+	.gordon_func = SPU_FUNC_SGEMM,
+#endif
+	.where = CORE|CUBLAS|GORDON,
+	.nbuffers = 3
+};
+
+/*
+ * Build (without submitting it) the task that accumulates
+ * A[y][z] * B[z][x] into C[y][x] for iteration iter. The task is
+ * identified by TAG(z, y, x, iter); no callback is set here.
+ */
+static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
+{
+	struct starpu_task *t = starpu_task_create();
+
+	/* identification */
+	t->use_tag = 1;
+	t->tag_id = TAG(z, y, x, iter);
+
+	/* what to run, and its argument */
+	t->cl = &cl;
+	t->cl_arg = &conf;
+	t->cl_arg_size = sizeof(struct block_conf);
+
+	/* operands: A and B blocks are read, the C block is updated */
+	t->buffers[0].state = A_state[y][z];
+	t->buffers[0].mode = R;
+
+	t->buffers[1].state = B_state[z][x];
+	t->buffers[1].mode = R;
+
+	t->buffers[2].state = C_state[y][x];
+	t->buffers[2].mode = RW;
+
+	return t;
+}
+
+
+/*
+ * arg points to the counter of blocks that still have work pending; the
+ * call that brings it down to zero triggers terminate().
+ */
+static void callback_func(void *arg)
+{
+	int remaining = STARPU_ATOMIC_ADD((int *)arg, -1);
+
+	/* some blocks are not finished yet */
+	if (remaining != 0)
+		return;
+
+	/* we are done */
+	fprintf(stderr, "done ...\n");
+	terminate();
+}
+
+
+/*
+ * Callback of the last task (z == nslicesz - 1) of an iteration on block
+ * (x, y). arg is the struct cb2_s allocated by submit_new_iter(); it is
+ * freed here. Removes the tags that are no longer needed, then either
+ * submits the next iteration or, after the last one, reports the block as
+ * complete.
+ */
+static void callback_func_2(void *arg)
+{
+	struct cb2_s *info = arg;
+	const unsigned iter = info->iter;
+	const unsigned x = info->blockx;
+	const unsigned y = info->blocky;
+	unsigned z = 0;
+
+	free(info);
+
+//	fprintf(stderr, "func 2 for x %d y %d iter %d\n", x, y, iter);
+
+	/* every tag of this iteration goes away, except the last one:
+	 * TAG(nslicesz - 1, y, x, iter) remains ... */
+	while (z < nslicesz - 1)
+	{
+		starpu_tag_remove(TAG(z, y, x, iter));
+		z++;
+	}
+
+	/* ... and the one kept by the previous iteration can now go */
+	if (iter > 0)
+		starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
+
+	if (iter != niter - 1) {
+		submit_new_iter(x, y, iter+1);
+		return;
+	}
+
+	/* that was the last iteration of this block */
+	callback_func(&xycounter);
+}
+
+
+
+/*
+ * Submit the nslicesz tasks of iteration iter on block (x, y). Each task
+ * depends on the previous one (z-1) through its tag, and the last one
+ * carries callback_func_2() together with a heap-allocated description of
+ * the block, which the callback frees.
+ */
+static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
+{
+	unsigned z;
+	for (z = 0; z < nslicesz; z++)
+	{
+		struct starpu_task *task;
+		task = construct_task(x, y, z, iter);
+
+		if (z != 0) {
+			starpu_tag_declare_deps(TAG(z, y, x, iter), 1, TAG(z-1, y, x, iter));
+		}
+
+		if (z == nslicesz - 1) {
+			struct cb2_s *cb2 = malloc(sizeof(*cb2));
+			/* do not dereference a failed allocation */
+			if (!cb2) {
+				fprintf(stderr, "could not allocate the callback argument\n");
+				exit(1);
+			}
+
+			cb2->blockx = x;
+			cb2->blocky = y;
+			cb2->iter = iter;
+			cb2->xycounter = &xycounter;
+
+			task->callback_func = callback_func_2;
+			task->callback_arg = cb2;
+		}
+
+		starpu_submit_task(task);
+	}
+}
+
+/*
+ * Start the timer and submit iteration 0 for every (x, y) block of C; the
+ * following iterations are submitted from callback_func_2().
+ */
+static void launch_codelets(void)
+{
+#ifdef USE_FXT
+	fxt_register_thread(0);
+#endif
+	unsigned bx, by;
+
+	/* only a callback per (nslicesz * niter) task given deps */
+	xycounter = nslicesx * nslicesy;
+
+	srand(time(NULL));
+
+	gettimeofday(&start, NULL);
+
+	/* partition the work into slices */
+	for (bx = 0; bx < nslicesx; bx++)
+	{
+		for (by = 0; by < nslicesy; by++)
+			submit_new_iter(bx, by, 0);
+	}
+}
+
+/*
+ * Entry point: parse the command line, start StarPU, build the blocks,
+ * submit the first iteration and wait until terminate() posts the
+ * semaphore; then clean up and shut the runtime down.
+ * Returns 0 once the computation has completed.
+ */
+int main(int argc, char **argv)
+{
+
+	parse_args(argc, argv);
+
+	/* start the runtime */
+	starpu_init();
+
+	sem_init(&sem, 0, 0U);
+
+	init_problem_data();
+
+	launch_codelets();
+
+	/* posted by terminate() once every block is done */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	cleanup_problem();
+
+	/* A stray exit(-1) used to sit here: it made every run report a
+	 * failure and left the shutdown below unreachable. */
+	starpu_shutdown();
+
+	return 0;
+}

+ 467 - 0
examples/mult/dw_mult_no_stride_no_tag.c

@@ -0,0 +1,467 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_mult.h"
+/* Step reached by the task chain of one block of C: block coordinates (x, y), current slice z and current iteration. */
+struct pos {
+	unsigned x,y, z,iter;
+};
+/* one progress record per block of C; its address is the callback argument of the tasks working on that block */
+struct pos currentpos [MAXSLICESY][MAXSLICESX];
+/* host buffers of the blocks of A, B and C (allocated in init_problem_data) */
+float *A[MAXSLICESY][MAXSLICESZ];
+float *B[MAXSLICESZ][MAXSLICESX];
+float *C[MAXSLICESY][MAXSLICESX];
+/* StarPU handles registered for the corresponding blocks */
+starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
+starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
+starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
+
+/* forward declaration: construct_task() installs it as task callback before its definition */
+static void callback_func_3(void *arg);
+/*
+ * This program computes C += A * B, repeated niter times
+ * 
+ *   A of size (z,y)
+ *   B of size (x,z)
+ *   C of size (x,y)
+
+              |---------------|
+            z |       B       |
+              |---------------|
+       z              x
+     |----|   |---------------|
+     |    |   |               |
+     |    |   |               |
+     | A  | y |       C       |
+     |    |   |               |
+     |    |   |               |
+     |----|   |---------------|
+
+ */
+
+/*
+ * Called once the last task has completed: stop the timer, print the elapsed
+ * time (stdout) and the flop statistics (stderr), then post `sem`.
+ */
+static void terminate(void)
+{
+	double timing;
+	uint64_t total_flop;
+	double gflop_total, gflop_cublas, gflop_atlas;
+
+	gettimeofday(&end, NULL);
+
+	/* elapsed time in microseconds */
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	total_flop = BLAS3_FLOP(ydim, xdim, zdim)*niter;
+
+	gflop_total = (double)total_flop/1000000000.0f;
+	gflop_cublas = (double)flop_cublas/1000000000.0f;
+	gflop_atlas = (double)flop_atlas/1000000000.0f;
+
+	fprintf(stderr, "Computation took (ms):\n");
+	printf("%2.2f\n", timing/1000);
+	fprintf(stderr, "	GFlop : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", gflop_total, gflop_cublas, gflop_atlas);
+	/* flop per microsecond divided by 1000 gives GFlop/s */
+	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
+
+	sem_post(&sem);
+}
+
+
+/*
+ * Locals shared by the kernels below.  descr[0], descr[1] and descr[2] are
+ * the buffers of the task, i.e. the blocks of A, B and C in that order.
+ */
+#define COMMON_CODE					\
+	float *subA = (float *)descr[0].blas.ptr;	\
+	float *subB = (float *)descr[1].blas.ptr;	\
+	float *subC = (float *)descr[2].blas.ptr;	\
+							\
+	uint32_t nxC = descr[2].blas.nx;		\
+	uint32_t nyC = descr[2].blas.ny;		\
+	uint32_t nyA = descr[0].blas.ny;		\
+							\
+	uint32_t ldA = descr[0].blas.ld;		\
+	uint32_t ldB = descr[1].blas.ld;		\
+	uint32_t ldC = descr[2].blas.ld;
+
+
+
+#ifdef USE_CUDA
+/*
+ * CUBLAS implementation of the codelet: C = A * B + C on the blocks given in
+ * descr, followed by the update of the cublas statistics counters.
+ */
+static void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	cublasStatus status;
+	uint64_t nflop;
+
+	COMMON_CODE
+
+	cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 1.0f, subC, ldC);
+
+	/* any CUBLAS error is fatal */
+	status = cublasGetError();
+	if (status != CUBLAS_STATUS_SUCCESS) {
+		STARPU_ASSERT(0);
+	}
+
+	nflop = BLAS3_FLOP(nyC, nxC, nyA);
+	flop_cublas += nflop;
+
+	ls_cublas += BLAS3_LS(nyC, nxC, nyA);
+}
+#endif
+
+/*
+ * CPU implementation of the codelet: C = A * B + C on the blocks given in
+ * descr, followed by the update of the atlas statistics counters.
+ */
+static void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	COMMON_CODE
+
+	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 1.0f, subC, ldC);
+
+	/* account for the work performed on the CPU */
+	flop_atlas += BLAS3_FLOP(nxC, nyC, nyA);
+	ls_atlas += BLAS3_LS(nxC, nyC, nyA);
+}
+
+#define MEM_ALIGNMENT	16
+
+/*
+ * Return a MEM_ALIGNMENT-aligned buffer able to hold nfloats floats.
+ * A failed allocation trips the assertion.
+ */
+static float *alloc_block(size_t nfloats)
+{
+	void *ptr = NULL;
+
+	/* posix_memalign() reports failure through its return value and leaves
+	 * the pointer unspecified in that case, so force it to NULL */
+	if (posix_memalign(&ptr, MEM_ALIGNMENT, nfloats*sizeof(float)) != 0)
+		ptr = NULL;
+
+	assert(ptr);
+
+	return ptr;
+}
+
+/*
+ * Allocate the blocks of A, B and C, fill A and B (with per-block constants
+ * when `norandom` is set, with drand48() values otherwise), zero C, reset the
+ * per-block progress records and register every block with StarPU.
+ */
+static void init_problem_data(void)
+{
+	unsigned i,j;
+	unsigned z, y, x;
+
+	/* debug ... : start from zeroed tables.  sizeof on the arrays themselves
+	 * keeps the byte counts right whatever the MAXSLICES* values are (B and
+	 * B_state are MAXSLICESZ x MAXSLICESX tables). */
+	memset(A, 0, sizeof(A));
+	memset(B, 0, sizeof(B));
+	memset(C, 0, sizeof(C));
+	memset(A_state, 0, sizeof(A_state));
+	memset(B_state, 0, sizeof(B_state));
+	memset(C_state, 0, sizeof(C_state));
+
+	/* Allocate grids of buffer */
+	/* TODO pin ... */
+	for (y = 0; y < nslicesy; y++)
+		for (z = 0; z < nslicesz; z++)
+			A[y][z] = alloc_block(BLOCKSIZEZ*BLOCKSIZEY);
+
+	for (z = 0; z < nslicesz; z++)
+		for (x = 0; x < nslicesx; x++)
+			B[z][x] = alloc_block(BLOCKSIZEX*BLOCKSIZEZ);
+
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+			C[y][x] = alloc_block(BLOCKSIZEX*BLOCKSIZEY);
+
+			/* the chain of block (x, y) starts at slice 0, iteration 0 */
+			currentpos[y][x].x = x;
+			currentpos[y][x].y = y;
+			currentpos[y][x].z = 0;
+			currentpos[y][x].iter = 0;
+		}
+	}
+
+	/* fill the A and B matrices: all of A first, then all of B, so that the
+	 * sequence of drand48() calls is the same whatever the block sizes */
+	for (y = 0; y < nslicesy; y++)
+		for (z = 0; z < nslicesz; z++)
+		{
+			float val = (float)(1 + z + y*nslicesz);
+
+			for (j = 0; j < BLOCKSIZEY; j++)
+				for (i = 0; i < BLOCKSIZEZ; i++)
+					A[y][z][i*BLOCKSIZEY + j] = norandom ? val : (float)(drand48());
+		}
+
+	for (z = 0; z < nslicesz; z++)
+		for (x = 0; x < nslicesx; x++)
+		{
+			float val = (float)(1 + x + z*nslicesx);
+
+			for (j = 0; j < BLOCKSIZEZ; j++)
+				for (i = 0; i < BLOCKSIZEX; i++)
+					B[z][x][i*BLOCKSIZEZ + j] = norandom ? val : (float)(drand48());
+		}
+
+	/* C accumulates the products, so it starts at zero */
+	for (y = 0; y < nslicesy; y++)
+		for (x = 0; x < nslicesx; x++)
+			for (j = 0; j < BLOCKSIZEY; j++)
+				for (i = 0; i < BLOCKSIZEX; i++)
+					C[y][x][i*BLOCKSIZEY + j] = (float)0;
+
+	/* declare the StarPU data to monitor */
+	for (y = 0; y < nslicesy; y++)
+		for (z = 0; z < nslicesz; z++)
+			starpu_monitor_blas_data(&A_state[y][z], 0, (uintptr_t)A[y][z], 
+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(float));
+
+	for (z = 0; z < nslicesz; z++)
+		for (x = 0; x < nslicesx; x++)
+			starpu_monitor_blas_data(&B_state[z][x], 0, (uintptr_t)B[z][x], 
+				BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(float));
+
+	for (y = 0; y < nslicesy; y++)
+		for (x = 0; x < nslicesx; x++)
+			starpu_monitor_blas_data(&C_state[y][x], 0, (uintptr_t)C[y][x], 
+				BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(float));
+
+	/* block dimensions passed to the codelet through cl_arg */
+	conf.k = BLOCKSIZEZ;
+	conf.m = BLOCKSIZEY;
+	conf.n = BLOCKSIZEX;
+
+	display_memory_consumption();
+}
+
+/*
+ * When CHECK_OUTPUT is defined, verify the result: niter * A * B is
+ * subtracted from every block of C and the sum of the absolute values that
+ * remain is compared against a tolerance.
+ *
+ * NOTE(review): the A, B and C block buffers are not released here -
+ * presumably because they are still registered with StarPU; confirm before
+ * adding free() calls.
+ */
+static void cleanup_problem(void)
+{
+#ifdef CHECK_OUTPUT
+	unsigned z, y, x;
+	float err;
+	float maxerr = 0.0;
+
+	fprintf(stderr, "Checking results ....");
+
+	for (y = 0; y < nslicesy; y++)
+	{
+		for (x = 0; x < nslicesx; x++)
+		{
+			/* C[y][x] -= niter * sum over z of A[y][z] * B[z][x] */
+			for (z = 0; z < nslicesz; z++)
+			{
+				SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(float)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
+			}
+
+			/* what is left in the block should be (close to) zero */
+			err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
+
+			if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001) 
+				fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
+
+			maxerr = STARPU_MAX(err, maxerr);
+		}
+	}
+
+	if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
+		fprintf(stderr, " maxerr = %f\n", maxerr/niter);
+	else
+		fprintf(stderr, " OK\n");
+
+	fflush(stderr);
+#endif
+}
+/* number of blocks of C whose task chain has not completed yet: set in launch_codelets(), decremented through callback_func() */
+int xycounter;
+/* NOTE(review): not referenced by the functions visible in this file - presumably kept from the tag-based variant of this example; confirm */
+struct cb2_s {
+	unsigned blockx;
+	unsigned blocky;
+	unsigned iter;
+	int *xycounter;
+};
+
+/* codelet used by every task: three buffers (A, B and C blocks), a CPU kernel and, when compiled in, CUBLAS and Gordon kernels */
+static starpu_codelet cl = {
+	.where = CORE|CUBLAS|GORDON,
+	.core_func = core_mult,
+#ifdef USE_CUDA
+	.cublas_func = cublas_mult,
+#endif
+#ifdef USE_GORDON
+	.gordon_func = SPU_FUNC_SGEMM,
+#endif
+	.nbuffers = 3
+};
+
+
+/*
+ * Create and submit the task working on slice z of block (x, y) for
+ * iteration `iter`: it reads A[y][z] and B[z][x] and updates C[y][x].
+ * posp records (z, iter) and is passed to callback_func_3 when the task
+ * completes.
+ */
+static void construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, struct pos *posp)
+{
+	struct starpu_task *t = starpu_task_create();
+
+	/* remember which step of the chain this task stands for */
+	posp->z = z;
+	posp->iter = iter;
+
+	t->cl = &cl;
+	t->cl_arg = &conf;
+	t->cl_arg_size = sizeof(struct block_conf);
+
+	/* inputs */
+	t->buffers[0].state = A_state[y][z];
+	t->buffers[0].mode = R;
+	t->buffers[1].state = B_state[z][x];
+	t->buffers[1].mode = R;
+
+	/* accumulator */
+	t->buffers[2].state = C_state[y][x];
+	t->buffers[2].mode = RW;
+
+	t->callback_func = callback_func_3;
+	t->callback_arg = posp;
+
+	starpu_submit_task(t);
+}
+
+
+/*
+ * Atomically decrement the counter pointed to by arg; whoever brings it down
+ * to zero terminates the run.
+ */
+static void callback_func(void *arg)
+{
+	int *remaining = arg;
+	int left = STARPU_ATOMIC_ADD(remaining, -1);
+
+	if (left != 0)
+		return;
+
+	/* we are done */
+	fprintf(stderr, "done ...\n");
+	terminate();
+}
+
+/*
+ * Task completion callback.  arg points to the struct pos of the block the
+ * finished task belongs to; the next step of that block's chain is submitted:
+ * the next slice, else the first slice of the next iteration, else the block
+ * is reported as complete.
+ */
+static void callback_func_3(void *arg)
+{
+	struct pos *where = arg;
+	unsigned bx = where->x;
+	unsigned by = where->y;
+	unsigned slice = where->z;
+	unsigned it = where->iter;
+
+	if (slice < nslicesz - 1)
+	{
+		/* more slices to go in this iteration */
+		construct_task(bx, by, slice+1, it, where);
+		return;
+	}
+
+	if (it < niter - 1)
+	{
+		/* iteration finished: restart from the first slice */
+		construct_task(bx, by, 0, it+1, where);
+		return;
+	}
+
+	/* last slice of the last iteration */
+	callback_func(&xycounter);
+}
+
+
+
+
+/*
+ * Start the computation: the first task of every (x, y) block is submitted,
+ * the following ones being chained from callback_func_3.  The start time is
+ * recorded just before the submissions.
+ */
+static void launch_codelets(void)
+{
+	unsigned bx, by;
+
+#ifdef USE_FXT
+	fxt_register_thread(0);
+#endif
+
+	/* one terminal callback is expected per (x, y) block */
+	xycounter = nslicesx * nslicesy;
+
+	srand(time(NULL));
+
+	gettimeofday(&start, NULL);
+
+	for (bx = 0; bx < nslicesx; bx++) {
+		for (by = 0; by < nslicesy; by++) {
+			construct_task(bx, by, 0, 0, &currentpos[by][bx]);
+		}
+	}
+}
+
+/*
+ * Entry point: parse the command line, start StarPU, set up the problem,
+ * submit the tasks and block on `sem` until terminate() posts it, then
+ * clean up, shut the runtime down and report success.
+ */
+int main(int argc, char **argv)
+{
+
+	parse_args(argc, argv);
+
+	/* start the runtime */
+	starpu_init();
+
+	sem_init(&sem, 0, 0U);
+
+	init_problem_data();
+
+	launch_codelets();
+
+	/* terminate() posts the semaphore once the last block is done */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	cleanup_problem();
+
+	/* stop the runtime before leaving, and exit with a success status */
+	starpu_shutdown();
+
+	return 0;
+}

+ 28 - 0
examples/pastix-wrappers/Makefile

@@ -0,0 +1,28 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+STARPU= ../../libstarpu.a
+
+# `all` and `clean` name actions, not files: keep them working even if a file
+# with one of these names shows up in the directory
+.PHONY: all clean
+
+all: starpu-blas-wrapper.a
+
+# object file of the wrapper
+starpu-blas-wrapper.o: starpu-blas-wrapper.c starpu-blas-wrapper.h
+	$(CC) $(CFLAGS) starpu-blas-wrapper.c -c -o starpu-blas-wrapper.o
+
+# static archive containing the wrapper object
+starpu-blas-wrapper.a: starpu-blas-wrapper.o
+	$(AR) rcs starpu-blas-wrapper.a starpu-blas-wrapper.o
+
+clean:
+	@rm -f *.a *.o *.d *.gcno *.gcda

+ 34 - 0
examples/pastix-wrappers/generated_model.h

@@ -0,0 +1,34 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+/* Fitted model coefficients. NOTE(review): the names match what models/model.sh emits (reg_gemm / reg_trsm output with the GEMM/TRSM prefix rewritten by sed), so this file is presumably generated - confirm before editing by hand. */
+#define GEMM_CPU_A  9.627597e-05
+#define GEMM_CPU_B  2.909318e-03
+#define GEMM_CPU_C  1.233512e-01
+#define GEMM_CPU_D  -2.789676e-01
+#define GEMM_CPU_E  -1.719838e-01
+#define GEMM_CPU_F  2.032491e+01
+#define GEMM_GPU_A  1.564597e-05
+#define GEMM_GPU_B  1.643119e-04
+#define GEMM_GPU_C  1.990316e-02
+#define GEMM_GPU_D  -1.120220e-02
+#define GEMM_GPU_E  2.416027e-01
+#define GEMM_GPU_F  1.974529e+01
+#define TRSM_GPU_A 4.302117e-06
+#define TRSM_GPU_B 5.423172e-01
+#define TRSM_GPU_C -4.868755e+00
+#define TRSM_CPU_A 1.362886e-05
+#define TRSM_CPU_B 6.283488e-01
+#define TRSM_CPU_C -4.053346e+01

+ 59 - 0
examples/pastix-wrappers/models/Makefile

@@ -0,0 +1,59 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# none of these targets produces a file bearing its name
+.PHONY: all clean num_rec
+
+all: reg_gemm reg_trsm 
+
+# regression tools: each one is linked against the num_recipes objects
+reg_axpy: reg_axpy.c num_rec
+	gcc -o reg_axpy reg_axpy.c num_recipes/*.o -Inum_recipes/ -lm
+
+reg_copy: reg_copy.c num_rec
+	gcc -o reg_copy reg_copy.c num_recipes/*.o -Inum_recipes/ -lm
+
+reg_geam: reg_geam.c num_rec
+	gcc -o reg_geam reg_geam.c num_recipes/*.o -Inum_recipes/ -lm
+
+reg_gemm: reg_gemm.c num_rec
+	gcc -g -o reg_gemm reg_gemm.c num_recipes/*.o -Inum_recipes/ -lm
+
+reg_pof: reg_pof.c num_rec
+	gcc -o reg_pof reg_pof.c num_recipes/*.o -Inum_recipes/ -lm
+
+reg_ppf: reg_ppf.c num_rec
+	gcc -o reg_ppf reg_ppf.c num_recipes/*.o -Inum_recipes/ -lm
+
+reg_scal: reg_scal.c num_rec
+	gcc -o reg_scal reg_scal.c num_recipes/*.o -Inum_recipes/ -lm
+
+reg_trsm: reg_trsm.c num_rec
+	gcc -g -o reg_trsm reg_trsm.c num_recipes/*.o -Inum_recipes/ -lm
+
+# builds every num_recipes object; the rules below write their output into
+# num_recipes/, not into the current directory
+num_rec: covsrt.o gaussj.o lfit.o nrutil.o num_recipes/nr.h num_recipes/nrutil.h num_recipes/complex.h
+
+covsrt.o: num_recipes/covsrt.c
+	gcc -g -o num_recipes/covsrt.o -c num_recipes/covsrt.c
+
+gaussj.o: num_recipes/gaussj.c
+	gcc -g -o num_recipes/gaussj.o -c num_recipes/gaussj.c
+
+lfit.o: num_recipes/lfit.c
+	gcc -g -o num_recipes/lfit.o -c num_recipes/lfit.c
+
+nrutil.o: num_recipes/nrutil.c
+	gcc -g -o num_recipes/nrutil.o -c num_recipes/nrutil.c
+
+clean:
+	rm -f reg_gemm reg_trsm
+	rm -rf *.o num_recipes/*.o

+ 31 - 0
examples/pastix-wrappers/models/model.sh

@@ -0,0 +1,31 @@
+#!/bin/bash
+
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+
+rm -f generated_model.h
+
+# contrib compact 
+./reg_gemm /home/gonnet/These/StarPU-stable/.sampling/starpu_compute_contrib_compact.barracuda.core.debug `wc -l /home/gonnet/These/StarPU-stable/.sampling/starpu_compute_contrib_compact.barracuda.core.debug` 2> /dev/null|sed -s s/GEMM/GEMM_CPU/ >  generated_model.h
+./reg_gemm /home/gonnet/These/StarPU-stable/.sampling/starpu_compute_contrib_compact.barracuda.cuda.debug `wc -l /home/gonnet/These/StarPU-stable/.sampling/starpu_compute_contrib_compact.barracuda.cuda.debug` 2> /dev/null|sed -s s/GEMM/GEMM_GPU/ >>  generated_model.h
+
+# strsm
+
+./reg_trsm /home/gonnet/These/StarPU-stable/.sampling/starpu_cblk_strsm.barracuda.cuda.debug `wc -l /home/gonnet/These/StarPU-stable/.sampling/starpu_cblk_strsm.barracuda.cuda.debug` 2> /dev/null|sed -s s/TRSM/TRSM_GPU/ >>  generated_model.h
+./reg_trsm /home/gonnet/These/StarPU-stable/.sampling/starpu_cblk_strsm.barracuda.core.debug `wc -l /home/gonnet/These/StarPU-stable/.sampling/starpu_cblk_strsm.barracuda.core.debug` 2> /dev/null|sed -s s/TRSM/TRSM_CPU/ >>  generated_model.h
+
+cat generated_model.h

+ 26 - 0
examples/pastix-wrappers/models/num_recipes/complex.h

@@ -0,0 +1,26 @@
+/* CAUTION: This is the ANSI C (only) version of the Numerical Recipes
+   utility file complex.h.  Do not confuse this file with the same-named
+   file complex.h that is supplied in the 'misc' subdirectory.
+   *That* file is the one from the book, and contains both ANSI and
+   traditional K&R versions, along with #ifdef macros to select the
+   correct version.  *This* file contains only ANSI C.               */
+
+#ifndef _NR_COMPLEX_H_
+#define _NR_COMPLEX_H_
+/* Complex type with float members. NOTE(review): nr.h declares fcomplex with REAL (double) members under the same _FCOMPLEX_DECLARE_T_ guard, so the layout depends on which header is included first. */
+#ifndef _FCOMPLEX_DECLARE_T_
+typedef struct FCOMPLEX {float r,i;} fcomplex;
+#define _FCOMPLEX_DECLARE_T_
+#endif /* _FCOMPLEX_DECLARE_T_ */
+/* complex arithmetic routines; only the prototypes live in this header */
+fcomplex Cadd(fcomplex a, fcomplex b);
+fcomplex Csub(fcomplex a, fcomplex b);
+fcomplex Cmul(fcomplex a, fcomplex b);
+fcomplex Complex(float re, float im);
+fcomplex Conjg(fcomplex z);
+fcomplex Cdiv(fcomplex a, fcomplex b);
+float Cabs(fcomplex z);
+fcomplex Csqrt(fcomplex z);
+fcomplex RCmul(float x, fcomplex a);
+
+#endif /* _NR_COMPLEX_H_ */

+ 20 - 0
examples/pastix-wrappers/models/num_recipes/covsrt.c

@@ -0,0 +1,20 @@
+#define REAL double
+#define SWAP(a,b) {swap=(a);(a)=(b);(b)=swap;}
+
+/*
+ * Spread the mfit x mfit matrix stored in the top-left corner of
+ * covar[1..ma][1..ma] over the full matrix: the rows and columns of the
+ * parameters flagged in ia[1..ma] receive the values, every other entry is
+ * zero.  All indices are 1-based.
+ */
+void covsrt(REAL **covar, int ma, int ia[], int mfit)
+{
+	int row, col, param;
+	int packed = mfit;	/* index of the next packed row/column to move */
+	REAL swap;
+
+	/* clear everything outside of the packed block */
+	for (row = mfit + 1; row <= ma; row++) {
+		for (col = 1; col <= row; col++) {
+			covar[col][row] = 0.0;
+			covar[row][col] = 0.0;
+		}
+	}
+
+	/* walk the parameters backwards, moving each fitted one into place */
+	for (param = ma; param >= 1; param--) {
+		if (!ia[param])
+			continue;
+
+		for (row = 1; row <= ma; row++)
+			SWAP(covar[row][packed],covar[row][param])
+		for (col = 1; col <= ma; col++)
+			SWAP(covar[packed][col],covar[param][col])
+
+		packed--;
+	}
+}
+#undef SWAP

+ 60 - 0
examples/pastix-wrappers/models/num_recipes/gaussj.c

@@ -0,0 +1,60 @@
+#include <math.h>
+#define NRANSI
+#include "nrutil.h"
+//#define REAL float
+#define SWAP(a,b) {temp=(a);(a)=(b);(b)=temp;}
+/* Gauss-Jordan elimination with full pivoting on a[1..n][1..n] (1-based); the m right-hand side columns b[1..n][1..m] are transformed along with it. Errors are reported through nrerror(). NOTE(review): irow/icol stay unset if no entry passes the `>= big` test (e.g. NaN input) - confirm callers cannot trigger that. */
+void gaussj(REAL **a, int n, REAL **b, int m)
+{
+	int *indxc,*indxr,*ipiv;
+	int i,icol,irow,j,k,l,ll;
+	REAL big,dum,pivinv,temp;
+
+	indxc=ivector(1,n);
+	indxr=ivector(1,n);
+	ipiv=ivector(1,n);
+	for (j=1;j<=n;j++) ipiv[j]=0;
+	for (i=1;i<=n;i++) { /* one pivot per pass */
+		big=0.0;
+		for (j=1;j<=n;j++) /* search the largest entry among rows/columns not pivoted yet */
+			if (ipiv[j] != 1)
+				for (k=1;k<=n;k++) {
+					if (ipiv[k] == 0) {
+						if (fabs(a[j][k]) >= big) {
+							big=fabs(a[j][k]);
+							irow=j;
+							icol=k;
+						}
+					} else if (ipiv[k] > 1) nrerror("gaussj: Singular Matrix-1");
+				}
+		++(ipiv[icol]);
+		if (irow != icol) { /* bring the pivot onto the diagonal by swapping rows */
+			for (l=1;l<=n;l++) SWAP(a[irow][l],a[icol][l])
+			for (l=1;l<=m;l++) SWAP(b[irow][l],b[icol][l])
+		}
+		indxr[i]=irow; /* remember the swap so that it can be undone below */
+		indxc[i]=icol;
+		if (a[icol][icol] == 0.0) nrerror("gaussj: Singular Matrix-2");
+		pivinv=1.0/a[icol][icol];
+		a[icol][icol]=1.0;
+		for (l=1;l<=n;l++) a[icol][l] *= pivinv; /* normalise the pivot row */
+		for (l=1;l<=m;l++) b[icol][l] *= pivinv;
+		for (ll=1;ll<=n;ll++) /* eliminate the pivot column from every other row */
+			if (ll != icol) {
+				dum=a[ll][icol];
+				a[ll][icol]=0.0;
+				for (l=1;l<=n;l++) a[ll][l] -= a[icol][l]*dum;
+				for (l=1;l<=m;l++) b[ll][l] -= b[icol][l]*dum;
+			}
+	}
+	for (l=n;l>=1;l--) { /* undo the swaps, in reverse order, by exchanging columns */
+		if (indxr[l] != indxc[l])
+			for (k=1;k<=n;k++)
+				SWAP(a[k][indxr[l]],a[k][indxc[l]]);
+	}
+	free_ivector(ipiv,1,n);
+	free_ivector(indxr,1,n);
+	free_ivector(indxc,1,n);
+}
+#undef SWAP
+#undef NRANSI

+ 66 - 0
examples/pastix-wrappers/models/num_recipes/lfit.c

@@ -0,0 +1,66 @@
+#define NRANSI
+#include <stdio.h>
+#include <stdlib.h>
+#include "nrutil.h"
+
+#define REAL float
+/* Linear least-squares fit of y[1..ndat] (deviations sig[1..ndat]) by a combination of the ma basis functions evaluated by funcs; only the coefficients a[j] with ia[j] != 0 are fitted. Fills a[], covar[][] and *chisq. NOTE(review): REAL is defined as float just above, whereas covsrt.c defines REAL as double - confirm the intended type. */
+void lfit(REAL x[], REAL y[], REAL sig[], int ndat, REAL a[], int ia[],
+	int ma, REAL **covar, REAL *chisq, void (*funcs)(REAL, REAL [], int))
+{
+	void covsrt(REAL **covar, int ma, int ia[], int mfit);
+	void gaussj(REAL **a, int n, REAL **b, int m);
+	int i,j,k,l,m,mfit=0;
+	REAL ym,wt,sum,sig2i,**beta,*afunc;
+
+	
+
+	beta=matrix(1,ma,1,1);
+	afunc=vector(1,ma);
+	for (j=1;j<=ma;j++) /* count the parameters to fit */
+		if (ia[j]) mfit++;
+	if (mfit == 0) nrerror("lfit: no parameters to be fitted");
+	for (j=1;j<=mfit;j++) { /* clear the normal equations */
+		for (k=1;k<=mfit;k++) covar[j][k]=0.0;
+		beta[j][1]=0.0;
+	}
+
+	for (i=1;i<=ndat;i++) { /* accumulate the normal equations over the data points */
+		(*funcs)(x[i],afunc,ma);
+		ym=y[i];
+		if (mfit < ma) { /* remove the contribution of the frozen parameters */
+			for (j=1;j<=ma;j++)
+				if (!ia[j]) ym -= a[j]*afunc[j];
+		}
+		sig2i=1.0/SQR(sig[i]);
+		for (j=0,l=1;l<=ma;l++) {
+			if (ia[l]) {
+				wt=afunc[l]*sig2i;
+				for (j++,k=0,m=1;m<=l;m++)
+					if (ia[m]) covar[j][++k] += wt*afunc[m];
+				beta[j][1] += ym*wt;
+			}
+		}
+	}
+	
+	for (j=2;j<=mfit;j++) /* fill the upper triangle by symmetry */
+		for (k=1;k<j;k++)
+			covar[k][j]=covar[j][k];
+//	printf("lfit : gaussj\n");	
+	gaussj(covar,mfit,beta,1);
+//	printf("lfit1\n");
+	for (j=0,l=1;l<=ma;l++) /* copy the solution back into the fitted slots of a[] */
+		if (ia[l]) a[l]=beta[++j][1];
+//	printf("lfit2\n");
+	*chisq=0.0;
+
+	for (i=1;i<=ndat;i++) { /* chi-square of the fitted model */
+		(*funcs)(x[i],afunc,ma);
+		for (sum=0.0,j=1;j<=ma;j++) sum += a[j]*afunc[j];
+		*chisq += SQR((y[i]-sum)/sig[i]);
+	}
+	covsrt(covar,ma,ia,mfit);
+	free_vector(afunc,1,ma);
+	free_matrix(beta,1,ma,1,1);
+}
+#undef NRANSI

+ 530 - 0
examples/pastix-wrappers/models/num_recipes/nr.h

@@ -0,0 +1,530 @@
+/* CAUTION: This is the ANSI C (only) version of the Numerical Recipes
+   utility file nr.h.  Do not confuse this file with the same-named
+   file nr.h that is supplied in the 'misc' subdirectory.
+   *That* file is the one from the book, and contains both ANSI and
+   traditional K&R versions, along with #ifdef macros to select the
+   correct version.  *This* file contains only ANSI C.               */
+
+#ifndef _NR_H_
+#define _NR_H_
+
+#define REAL double	/* NOTE(review): nrutil.h, nrutil.c and lfit.c in this
+			   same commit define REAL as float, so the lfit()
+			   prototype in this header uses double where the
+			   definition in lfit.c uses float -- confirm which
+			   element type is intended before including both. */
+
+
+#ifndef _FCOMPLEX_DECLARE_T_	/* skip the typedef if it was already declared elsewhere */
+typedef struct FCOMPLEX {REAL r,i;} fcomplex;	/* two REAL members, presumably real and imaginary part; used by hypgeo, hypser, laguer and zroots below */
+#define _FCOMPLEX_DECLARE_T_
+#endif /* _FCOMPLEX_DECLARE_T_ */
+
+#ifndef _ARITHCODE_DECLARE_T_	/* skip the typedef if it was already declared elsewhere */
+typedef struct {	/* state record passed by pointer to arcmak() and arcode() below */
+	unsigned long *ilob,*iupb,*ncumfq,jdif,nc,minint,nch,ncum,nrad;	/* three pointers followed by six scalar fields; meanings are not visible in this header */
+} arithcode;
+#define _ARITHCODE_DECLARE_T_
+#endif /* _ARITHCODE_DECLARE_T_ */
+
+#ifndef _HUFFCODE_DECLARE_T_	/* skip the typedef if it was already declared elsewhere */
+typedef struct {	/* state record passed by pointer to hufdec(), hufenc() and hufmak() below */
+	unsigned long *icod,*ncod,*left,*right,nch,nodemax;	/* four pointers followed by two scalar fields; meanings are not visible in this header */
+} huffcode;
+#define _HUFFCODE_DECLARE_T_
+#endif /* _HUFFCODE_DECLARE_T_ */
+
+#include <stdio.h>
+
+void addint(double **uf, double **uc, double **res, int nf);
+void airy(REAL x, REAL *ai, REAL *bi, REAL *aip, REAL *bip);
+void amebsa(REAL **p, REAL y[], int ndim, REAL pb[],	REAL *yb,
+	REAL ftol, REAL (*funk)(REAL []), int *iter, REAL temptr);
+void amoeba(REAL **p, REAL y[], int ndim, REAL ftol,
+	REAL (*funk)(REAL []), int *iter);
+REAL amotry(REAL **p, REAL y[], REAL psum[], int ndim,
+	REAL (*funk)(REAL []), int ihi, REAL fac);
+REAL amotsa(REAL **p, REAL y[], REAL psum[], int ndim, REAL pb[],
+	REAL *yb, REAL (*funk)(REAL []), int ihi, REAL *yhi, REAL fac);
+void anneal(REAL x[], REAL y[], int iorder[], int ncity);
+double anorm2(double **a, int n);
+void arcmak(unsigned long nfreq[], unsigned long nchh, unsigned long nradd,
+	arithcode *acode);
+void arcode(unsigned long *ich, unsigned char **codep, unsigned long *lcode,
+	unsigned long *lcd, int isign, arithcode *acode);
+void arcsum(unsigned long iin[], unsigned long iout[], unsigned long ja,
+	int nwk, unsigned long nrad, unsigned long nc);
+void asolve(unsigned long n, double b[], double x[], int itrnsp);
+void atimes(unsigned long n, double x[], double r[], int itrnsp);
+void avevar(REAL data[], unsigned long n, REAL *ave, REAL *var);
+void balanc(REAL **a, int n);
+void banbks(REAL **a, unsigned long n, int m1, int m2, REAL **al,
+	unsigned long indx[], REAL b[]);
+void bandec(REAL **a, unsigned long n, int m1, int m2, REAL **al,
+	unsigned long indx[], REAL *d);
+void banmul(REAL **a, unsigned long n, int m1, int m2, REAL x[], REAL b[]);
+void bcucof(REAL y[], REAL y1[], REAL y2[], REAL y12[], REAL d1,
+	REAL d2, REAL **c);
+void bcuint(REAL y[], REAL y1[], REAL y2[], REAL y12[],
+	REAL x1l, REAL x1u, REAL x2l, REAL x2u, REAL x1,
+	REAL x2, REAL *ansy, REAL *ansy1, REAL *ansy2);
+void beschb(double x, double *gam1, double *gam2, double *gampl,
+	double *gammi);
+REAL bessi(int n, REAL x);
+REAL bessi0(REAL x);
+REAL bessi1(REAL x);
+void bessik(REAL x, REAL xnu, REAL *ri, REAL *rk, REAL *rip,
+	REAL *rkp);
+REAL bessj(int n, REAL x);
+REAL bessj0(REAL x);
+REAL bessj1(REAL x);
+void bessjy(REAL x, REAL xnu, REAL *rj, REAL *ry, REAL *rjp,
+	REAL *ryp);
+REAL bessk(int n, REAL x);
+REAL bessk0(REAL x);
+REAL bessk1(REAL x);
+REAL bessy(int n, REAL x);
+REAL bessy0(REAL x);
+REAL bessy1(REAL x);
+REAL beta(REAL z, REAL w);
+REAL betacf(REAL a, REAL b, REAL x);
+REAL betai(REAL a, REAL b, REAL x);
+REAL bico(int n, int k);
+void bksub(int ne, int nb, int jf, int k1, int k2, REAL ***c);
+REAL bnldev(REAL pp, int n, long *idum);
+REAL brent(REAL ax, REAL bx, REAL cx,
+	REAL (*f)(REAL), REAL tol, REAL *xmin);
+void broydn(REAL x[], int n, int *check,
+	void (*vecfunc)(int, REAL [], REAL []));
+void bsstep(REAL y[], REAL dydx[], int nv, REAL *xx, REAL htry,
+	REAL eps, REAL yscal[], REAL *hdid, REAL *hnext,
+	void (*derivs)(REAL, REAL [], REAL []));
+void caldat(long julian, int *mm, int *id, int *iyyy);
+void chder(REAL a, REAL b, REAL c[], REAL cder[], int n);
+REAL chebev(REAL a, REAL b, REAL c[], int m, REAL x);
+void chebft(REAL a, REAL b, REAL c[], int n, REAL (*func)(REAL));
+void chebpc(REAL c[], REAL d[], int n);
+void chint(REAL a, REAL b, REAL c[], REAL cint[], int n);
+REAL chixy(REAL bang);
+void choldc(REAL **a, int n, REAL p[]);
+void cholsl(REAL **a, int n, REAL p[], REAL b[], REAL x[]);
+void chsone(REAL bins[], REAL ebins[], int nbins, int knstrn,
+	REAL *df, REAL *chsq, REAL *prob);
+void chstwo(REAL bins1[], REAL bins2[], int nbins, int knstrn,
+	REAL *df, REAL *chsq, REAL *prob);
+void cisi(REAL x, REAL *ci, REAL *si);
+void cntab1(int **nn, int ni, int nj, REAL *chisq,
+	REAL *df, REAL *prob, REAL *cramrv, REAL *ccc);
+void cntab2(int **nn, int ni, int nj, REAL *h, REAL *hx, REAL *hy,
+	REAL *hygx, REAL *hxgy, REAL *uygx, REAL *uxgy, REAL *uxy);
+void convlv(REAL data[], unsigned long n, REAL respns[], unsigned long m,
+	int isign, REAL ans[]);
+void copy(double **aout, double **ain, int n);
+void correl(REAL data1[], REAL data2[], unsigned long n, REAL ans[]);
+void cosft(REAL y[], int n, int isign);
+void cosft1(REAL y[], int n);
+void cosft2(REAL y[], int n, int isign);
+void covsrt(REAL **covar, int ma, int ia[], int mfit);
+void crank(unsigned long n, REAL w[], REAL *s);
+void cyclic(REAL a[], REAL b[], REAL c[], REAL alpha, REAL beta,
+	REAL r[], REAL x[], unsigned long n);
+void daub4(REAL a[], unsigned long n, int isign);
+REAL dawson(REAL x);
+REAL dbrent(REAL ax, REAL bx, REAL cx,
+	REAL (*f)(REAL), REAL (*df)(REAL), REAL tol, REAL *xmin);
+void ddpoly(REAL c[], int nc, REAL x, REAL pd[], int nd);
+int decchk(char string[], int n, char *ch);
+void derivs(REAL x, REAL y[], REAL dydx[]);
+REAL df1dim(REAL x);
+void dfour1(double data[], unsigned long nn, int isign);
+void dfpmin(REAL p[], int n, REAL gtol, int *iter, REAL *fret,
+	REAL (*func)(REAL []), void (*dfunc)(REAL [], REAL []));
+REAL dfridr(REAL (*func)(REAL), REAL x, REAL h, REAL *err);
+void dftcor(REAL w, REAL delta, REAL a, REAL b, REAL endpts[],
+	REAL *corre, REAL *corim, REAL *corfac);
+void dftint(REAL (*func)(REAL), REAL a, REAL b, REAL w,
+	REAL *cosint, REAL *sinint);
+void difeq(int k, int k1, int k2, int jsf, int is1, int isf,
+	int indexv[], int ne, REAL **s, REAL **y);
+void dlinmin(REAL p[], REAL xi[], int n, REAL *fret,
+	REAL (*func)(REAL []), void (*dfunc)(REAL [], REAL[]));
+double dpythag(double a, double b);
+void drealft(double data[], unsigned long n, int isign);
+void dsprsax(double sa[], unsigned long ija[], double x[], double b[],
+	unsigned long n);
+void dsprstx(double sa[], unsigned long ija[], double x[], double b[],
+	unsigned long n);
+void dsvbksb(double **u, double w[], double **v, int m, int n, double b[],
+	double x[]);
+void dsvdcmp(double **a, int m, int n, double w[], double **v);
+void eclass(int nf[], int n, int lista[], int listb[], int m);
+void eclazz(int nf[], int n, int (*equiv)(int, int));
+REAL ei(REAL x);
+void eigsrt(REAL d[], REAL **v, int n);
+REAL elle(REAL phi, REAL ak);
+REAL ellf(REAL phi, REAL ak);
+REAL ellpi(REAL phi, REAL en, REAL ak);
+void elmhes(REAL **a, int n);
+REAL erfcc(REAL x);
+//REAL erff(REAL x);
+REAL erffc(REAL x);
+void eulsum(REAL *sum, REAL term, int jterm, REAL wksp[]);
+REAL evlmem(REAL fdt, REAL d[], int m, REAL xms);
+REAL expdev(long *idum);
+REAL expint(int n, REAL x);
+REAL f1(REAL x);
+REAL f1dim(REAL x);
+REAL f2(REAL y);
+REAL f3(REAL z);
+REAL factln(int n);
+REAL factrl(int n);
+void fasper(REAL x[], REAL y[], unsigned long n, REAL ofac, REAL hifac,
+	REAL wk1[], REAL wk2[], unsigned long nwk, unsigned long *nout,
+	unsigned long *jmax, REAL *prob);
+void fdjac(int n, REAL x[], REAL fvec[], REAL **df,
+	void (*vecfunc)(int, REAL [], REAL []));
+void fgauss(REAL x, REAL a[], REAL *y, REAL dyda[], int na);
+void fill0(double **u, int n);
+void fit(REAL x[], REAL y[], int ndata, REAL sig[], int mwt,
+	REAL *a, REAL *b, REAL *siga, REAL *sigb, REAL *chi2, REAL *q);
+void fitexy(REAL x[], REAL y[], int ndat, REAL sigx[], REAL sigy[],
+	REAL *a, REAL *b, REAL *siga, REAL *sigb, REAL *chi2, REAL *q);
+void fixrts(REAL d[], int m);
+void fleg(REAL x, REAL pl[], int nl);
+void flmoon(int n, int nph, long *jd, REAL *frac);
+REAL fmin(REAL x[]);
+void four1(REAL data[], unsigned long nn, int isign);
+void fourew(FILE *file[5], int *na, int *nb, int *nc, int *nd);
+void fourfs(FILE *file[5], unsigned long nn[], int ndim, int isign);
+void fourn(REAL data[], unsigned long nn[], int ndim, int isign);
+void fpoly(REAL x, REAL p[], int np);
+void fred2(int n, REAL a, REAL b, REAL t[], REAL f[], REAL w[],
+	REAL (*g)(REAL), REAL (*ak)(REAL, REAL));
+REAL fredin(REAL x, int n, REAL a, REAL b, REAL t[], REAL f[], REAL w[],
+	REAL (*g)(REAL), REAL (*ak)(REAL, REAL));
+void frenel(REAL x, REAL *s, REAL *c);
+void frprmn(REAL p[], int n, REAL ftol, int *iter, REAL *fret,
+	REAL (*func)(REAL []), void (*dfunc)(REAL [], REAL []));
+void ftest(REAL data1[], unsigned long n1, REAL data2[], unsigned long n2,
+	REAL *f, REAL *prob);
+REAL gamdev(int ia, long *idum);
+REAL gammln(REAL xx);
+REAL gammp(REAL a, REAL x);
+REAL gammq(REAL a, REAL x);
+REAL gasdev(long *idum);
+void gaucof(int n, REAL a[], REAL b[], REAL amu0, REAL x[], REAL w[]);
+void gauher(REAL x[], REAL w[], int n);
+void gaujac(REAL x[], REAL w[], int n, REAL alf, REAL bet);
+void gaulag(REAL x[], REAL w[], int n, REAL alf);
+void gauleg(REAL x1, REAL x2, REAL x[], REAL w[], int n);
+void gaussj(REAL **a, int n, REAL **b, int m);
+void gcf(REAL *gammcf, REAL a, REAL x, REAL *gln);
+REAL golden(REAL ax, REAL bx, REAL cx, REAL (*f)(REAL), REAL tol,
+	REAL *xmin);
+void gser(REAL *gamser, REAL a, REAL x, REAL *gln);
+void hpsel(unsigned long m, unsigned long n, REAL arr[], REAL heap[]);
+void hpsort(unsigned long n, REAL ra[]);
+void hqr(REAL **a, int n, REAL wr[], REAL wi[]);
+void hufapp(unsigned long index[], unsigned long nprob[], unsigned long n,
+	unsigned long i);
+void hufdec(unsigned long *ich, unsigned char *code, unsigned long lcode,
+	unsigned long *nb, huffcode *hcode);
+void hufenc(unsigned long ich, unsigned char **codep, unsigned long *lcode,
+	unsigned long *nb, huffcode *hcode);
+void hufmak(unsigned long nfreq[], unsigned long nchin, unsigned long *ilong,
+	unsigned long *nlong, huffcode *hcode);
+void hunt(REAL xx[], unsigned long n, REAL x, unsigned long *jlo);
+void hypdrv(REAL s, REAL yy[], REAL dyyds[]);
+fcomplex hypgeo(fcomplex a, fcomplex b, fcomplex c, fcomplex z);
+void hypser(fcomplex a, fcomplex b, fcomplex c, fcomplex z,
+	fcomplex *series, fcomplex *deriv);
+unsigned short icrc(unsigned short crc, unsigned char *bufptr,
+	unsigned long len, short jinit, int jrev);
+unsigned short icrc1(unsigned short crc, unsigned char onech);
+unsigned long igray(unsigned long n, int is);
+void iindexx(unsigned long n, long arr[], unsigned long indx[]);
+void indexx(unsigned long n, REAL arr[], unsigned long indx[]);
+void interp(double **uf, double **uc, int nf);
+int irbit1(unsigned long *iseed);
+int irbit2(unsigned long *iseed);
+void jacobi(REAL **a, int n, REAL d[], REAL **v, int *nrot);
+void jacobn(REAL x, REAL y[], REAL dfdx[], REAL **dfdy, int n);
+long julday(int mm, int id, int iyyy);
+void kendl1(REAL data1[], REAL data2[], unsigned long n, REAL *tau, REAL *z,
+	REAL *prob);
+void kendl2(REAL **tab, int i, int j, REAL *tau, REAL *z, REAL *prob);
+void kermom(double w[], double y, int m);
+void ks2d1s(REAL x1[], REAL y1[], unsigned long n1,
+	void (*quadvl)(REAL, REAL, REAL *, REAL *, REAL *, REAL *),
+	REAL *d1, REAL *prob);
+void ks2d2s(REAL x1[], REAL y1[], unsigned long n1, REAL x2[], REAL y2[],
+	unsigned long n2, REAL *d, REAL *prob);
+void ksone(REAL data[], unsigned long n, REAL (*func)(REAL), REAL *d,
+	REAL *prob);
+void kstwo(REAL data1[], unsigned long n1, REAL data2[], unsigned long n2,
+	REAL *d, REAL *prob);
+void laguer(fcomplex a[], int m, fcomplex *x, int *its);
+void lfit(REAL x[], REAL y[], REAL sig[], int ndat, REAL a[], int ia[],
+	int ma, REAL **covar, REAL *chisq, void (*funcs)(REAL, REAL [], int));
+void linbcg(unsigned long n, double b[], double x[], int itol, double tol,
+	 int itmax, int *iter, double *err);
+void linmin(REAL p[], REAL xi[], int n, REAL *fret,
+	REAL (*func)(REAL []));
+void lnsrch(int n, REAL xold[], REAL fold, REAL g[], REAL p[], REAL x[],
+	 REAL *f, REAL stpmax, int *check, REAL (*func)(REAL []));
+void load(REAL x1, REAL v[], REAL y[]);
+void load1(REAL x1, REAL v1[], REAL y[]);
+void load2(REAL x2, REAL v2[], REAL y[]);
+void locate(REAL xx[], unsigned long n, REAL x, unsigned long *j);
+void lop(double **out, double **u, int n);
+void lubksb(REAL **a, int n, int *indx, REAL b[]);
+void ludcmp(REAL **a, int n, int *indx, REAL *d);
+void machar(int *ibeta, int *it, int *irnd, int *ngrd,
+	int *machep, int *negep, int *iexp, int *minexp, int *maxexp,
+	REAL *eps, REAL *epsneg, REAL *xmin, REAL *xmax);
+void matadd(double **a, double **b, double **c, int n);
+void matsub(double **a, double **b, double **c, int n);
+void medfit(REAL x[], REAL y[], int ndata, REAL *a, REAL *b, REAL *abdev);
+void memcof(REAL data[], int n, int m, REAL *xms, REAL d[]);
+int metrop(REAL de, REAL t);
+void mgfas(double **u, int n, int maxcyc);
+void mglin(double **u, int n, int ncycle);
+REAL midexp(REAL (*funk)(REAL), REAL aa, REAL bb, int n);
+REAL midinf(REAL (*funk)(REAL), REAL aa, REAL bb, int n);
+REAL midpnt(REAL (*func)(REAL), REAL a, REAL b, int n);
+REAL midsql(REAL (*funk)(REAL), REAL aa, REAL bb, int n);
+REAL midsqu(REAL (*funk)(REAL), REAL aa, REAL bb, int n);
+void miser(REAL (*func)(REAL []), REAL regn[], int ndim, unsigned long npts,
+	REAL dith, REAL *ave, REAL *var);
+void mmid(REAL y[], REAL dydx[], int nvar, REAL xs, REAL htot,
+	int nstep, REAL yout[], void (*derivs)(REAL, REAL[], REAL[]));
+void mnbrak(REAL *ax, REAL *bx, REAL *cx, REAL *fa, REAL *fb,
+	REAL *fc, REAL (*func)(REAL));
+void mnewt(int ntrial, REAL x[], int n, REAL tolx, REAL tolf);
+void moment(REAL data[], int n, REAL *ave, REAL *adev, REAL *sdev,
+	REAL *var, REAL *skew, REAL *curt);
+void mp2dfr(unsigned char a[], unsigned char s[], int n, int *m);
+void mpadd(unsigned char w[], unsigned char u[], unsigned char v[], int n);
+void mpdiv(unsigned char q[], unsigned char r[], unsigned char u[],
+	unsigned char v[], int n, int m);
+void mpinv(unsigned char u[], unsigned char v[], int n, int m);
+void mplsh(unsigned char u[], int n);
+void mpmov(unsigned char u[], unsigned char v[], int n);
+void mpmul(unsigned char w[], unsigned char u[], unsigned char v[], int n,
+	int m);
+void mpneg(unsigned char u[], int n);
+void mppi(int n);
+void mprove(REAL **a, REAL **alud, int n, int indx[], REAL b[],
+	REAL x[]);
+void mpsad(unsigned char w[], unsigned char u[], int n, int iv);
+void mpsdv(unsigned char w[], unsigned char u[], int n, int iv, int *ir);
+void mpsmu(unsigned char w[], unsigned char u[], int n, int iv);
+void mpsqrt(unsigned char w[], unsigned char u[], unsigned char v[], int n,
+	int m);
+void mpsub(int *is, unsigned char w[], unsigned char u[], unsigned char v[],
+	int n);
+void mrqcof(REAL x[], REAL y[], REAL sig[], int ndata, REAL a[],
+	int ia[], int ma, REAL **alpha, REAL beta[], REAL *chisq,
+	void (*funcs)(REAL, REAL [], REAL *, REAL [], int));
+void mrqmin(REAL x[], REAL y[], REAL sig[], int ndata, REAL a[],
+	int ia[], int ma, REAL **covar, REAL **alpha, REAL *chisq,
+	void (*funcs)(REAL, REAL [], REAL *, REAL [], int), REAL *alamda);
+void newt(REAL x[], int n, int *check,
+	void (*vecfunc)(int, REAL [], REAL []));
+void odeint(REAL ystart[], int nvar, REAL x1, REAL x2,
+	REAL eps, REAL h1, REAL hmin, int *nok, int *nbad,
+	void (*derivs)(REAL, REAL [], REAL []),
+	void (*rkqs)(REAL [], REAL [], int, REAL *, REAL, REAL,
+	REAL [], REAL *, REAL *, void (*)(REAL, REAL [], REAL [])));
+void orthog(int n, REAL anu[], REAL alpha[], REAL beta[], REAL a[],
+	REAL b[]);
+void pade(double cof[], int n, REAL *resid);
+void pccheb(REAL d[], REAL c[], int n);
+void pcshft(REAL a, REAL b, REAL d[], int n);
+void pearsn(REAL x[], REAL y[], unsigned long n, REAL *r, REAL *prob,
+	REAL *z);
+void period(REAL x[], REAL y[], int n, REAL ofac, REAL hifac,
+	REAL px[], REAL py[], int np, int *nout, int *jmax, REAL *prob);
+void piksr2(int n, REAL arr[], REAL brr[]);
+void piksrt(int n, REAL arr[]);
+void pinvs(int ie1, int ie2, int je1, int jsf, int jc1, int k,
+	REAL ***c, REAL **s);
+REAL plgndr(int l, int m, REAL x);
+REAL poidev(REAL xm, long *idum);
+void polcoe(REAL x[], REAL y[], int n, REAL cof[]);
+void polcof(REAL xa[], REAL ya[], int n, REAL cof[]);
+void poldiv(REAL u[], int n, REAL v[], int nv, REAL q[], REAL r[]);
+void polin2(REAL x1a[], REAL x2a[], REAL **ya, int m, int n,
+	REAL x1, REAL x2, REAL *y, REAL *dy);
+void polint(REAL xa[], REAL ya[], int n, REAL x, REAL *y, REAL *dy);
+void powell(REAL p[], REAL **xi, int n, REAL ftol, int *iter, REAL *fret,
+	REAL (*func)(REAL []));
+void predic(REAL data[], int ndata, REAL d[], int m, REAL future[], int nfut);
+REAL probks(REAL alam);
+void psdes(unsigned long *lword, unsigned long *irword);
+void pwt(REAL a[], unsigned long n, int isign);
+void pwtset(int n);
+REAL pythag(REAL a, REAL b);
+void pzextr(int iest, REAL xest, REAL yest[], REAL yz[], REAL dy[],
+	int nv);
+REAL qgaus(REAL (*func)(REAL), REAL a, REAL b);
+void qrdcmp(REAL **a, int n, REAL *c, REAL *d, int *sing);
+REAL qromb(REAL (*func)(REAL), REAL a, REAL b);
+REAL qromo(REAL (*func)(REAL), REAL a, REAL b,
+	REAL (*choose)(REAL (*)(REAL), REAL, REAL, int));
+void qroot(REAL p[], int n, REAL *b, REAL *c, REAL eps);
+void qrsolv(REAL **a, int n, REAL c[], REAL d[], REAL b[]);
+void qrupdt(REAL **r, REAL **qt, int n, REAL u[], REAL v[]);
+REAL qsimp(REAL (*func)(REAL), REAL a, REAL b);
+REAL qtrap(REAL (*func)(REAL), REAL a, REAL b);
+REAL quad3d(REAL (*func)(REAL, REAL, REAL), REAL x1, REAL x2);
+void quadct(REAL x, REAL y, REAL xx[], REAL yy[], unsigned long nn,
+	REAL *fa, REAL *fb, REAL *fc, REAL *fd);
+void quadmx(REAL **a, int n);
+void quadvl(REAL x, REAL y, REAL *fa, REAL *fb, REAL *fc, REAL *fd);
+REAL ran0(long *idum);
+REAL ran1(long *idum);
+REAL ran2(long *idum);
+REAL ran3(long *idum);
+REAL ran4(long *idum);
+void rank(unsigned long n, unsigned long indx[], unsigned long irank[]);
+void ranpt(REAL pt[], REAL regn[], int n);
+void ratint(REAL xa[], REAL ya[], int n, REAL x, REAL *y, REAL *dy);
+void ratlsq(double (*fn)(double), double a, double b, int mm, int kk,
+	double cof[], double *dev);
+double ratval(double x, double cof[], int mm, int kk);
+REAL rc(REAL x, REAL y);
+REAL rd(REAL x, REAL y, REAL z);
+void realft(REAL data[], unsigned long n, int isign);
+void rebin(REAL rc, int nd, REAL r[], REAL xin[], REAL xi[]);
+void red(int iz1, int iz2, int jz1, int jz2, int jm1, int jm2, int jmf,
+	int ic1, int jc1, int jcf, int kc, REAL ***c, REAL **s);
+void relax(double **u, double **rhs, int n);
+void relax2(double **u, double **rhs, int n);
+void resid(double **res, double **u, double **rhs, int n);
+REAL revcst(REAL x[], REAL y[], int iorder[], int ncity, int n[]);
+void reverse(int iorder[], int ncity, int n[]);
+REAL rf(REAL x, REAL y, REAL z);
+REAL rj(REAL x, REAL y, REAL z, REAL p);
+void rk4(REAL y[], REAL dydx[], int n, REAL x, REAL h, REAL yout[],
+	void (*derivs)(REAL, REAL [], REAL []));
+void rkck(REAL y[], REAL dydx[], int n, REAL x, REAL h,
+	REAL yout[], REAL yerr[], void (*derivs)(REAL, REAL [], REAL []));
+void rkdumb(REAL vstart[], int nvar, REAL x1, REAL x2, int nstep,
+	void (*derivs)(REAL, REAL [], REAL []));
+void rkqs(REAL y[], REAL dydx[], int n, REAL *x,
+	REAL htry, REAL eps, REAL yscal[], REAL *hdid, REAL *hnext,
+	void (*derivs)(REAL, REAL [], REAL []));
+void rlft3(REAL ***data, REAL **speq, unsigned long nn1,
+	unsigned long nn2, unsigned long nn3, int isign);
+REAL rofunc(REAL b);
+void rotate(REAL **r, REAL **qt, int n, int i, REAL a, REAL b);
+void rsolv(REAL **a, int n, REAL d[], REAL b[]);
+void rstrct(double **uc, double **uf, int nc);
+REAL rtbis(REAL (*func)(REAL), REAL x1, REAL x2, REAL xacc);
+REAL rtflsp(REAL (*func)(REAL), REAL x1, REAL x2, REAL xacc);
+REAL rtnewt(void (*funcd)(REAL, REAL *, REAL *), REAL x1, REAL x2,
+	REAL xacc);
+REAL rtsafe(void (*funcd)(REAL, REAL *, REAL *), REAL x1, REAL x2,
+	REAL xacc);
+REAL rtsec(REAL (*func)(REAL), REAL x1, REAL x2, REAL xacc);
+void rzextr(int iest, REAL xest, REAL yest[], REAL yz[], REAL dy[], int nv);
+void savgol(REAL c[], int np, int nl, int nr, int ld, int m);
+void score(REAL xf, REAL y[], REAL f[]);
+void scrsho(REAL (*fx)(REAL));
+REAL select_(unsigned long k, unsigned long n, REAL arr[]);
+REAL selip(unsigned long k, unsigned long n, REAL arr[]);
+void shell(unsigned long n, REAL a[]);
+void shoot(int n, REAL v[], REAL f[]);
+void shootf(int n, REAL v[], REAL f[]);
+void simp1(REAL **a, int mm, int ll[], int nll, int iabf, int *kp,
+	REAL *bmax);
+void simp2(REAL **a, int n, int l2[], int nl2, int *ip, int kp, REAL *q1);
+void simp3(REAL **a, int i1, int k1, int ip, int kp);
+void simplx(REAL **a, int m, int n, int m1, int m2, int m3, int *icase,
+	int izrov[], int iposv[]);
+void simpr(REAL y[], REAL dydx[], REAL dfdx[], REAL **dfdy,
+	int n, REAL xs, REAL htot, int nstep, REAL yout[],
+	void (*derivs)(REAL, REAL [], REAL []));
+void sinft(REAL y[], int n);
+void slvsm2(double **u, double **rhs);
+void slvsml(double **u, double **rhs);
+void sncndn(REAL uu, REAL emmc, REAL *sn, REAL *cn, REAL *dn);
+double snrm(unsigned long n, double sx[], int itol);
+void sobseq(int *n, REAL x[]);
+void solvde(int itmax, REAL conv, REAL slowc, REAL scalv[],
+	int indexv[], int ne, int nb, int m, REAL **y, REAL ***c, REAL **s);
+void sor(double **a, double **b, double **c, double **d, double **e,
+	double **f, double **u, int jmax, double rjac);
+void sort(unsigned long n, REAL arr[]);
+void sort2(unsigned long n, REAL arr[], REAL brr[]);
+void sort3(unsigned long n, REAL ra[], REAL rb[], REAL rc[]);
+void spctrm(FILE *fp, REAL p[], int m, int k, int ovrlap);
+void spear(REAL data1[], REAL data2[], unsigned long n, REAL *d, REAL *zd,
+	REAL *probd, REAL *rs, REAL *probrs);
+void sphbes(int n, REAL x, REAL *sj, REAL *sy, REAL *sjp, REAL *syp);
+void splie2(REAL x1a[], REAL x2a[], REAL **ya, int m, int n, REAL **y2a);
+void splin2(REAL x1a[], REAL x2a[], REAL **ya, REAL **y2a, int m, int n,
+	REAL x1, REAL x2, REAL *y);
+void spline(REAL x[], REAL y[], int n, REAL yp1, REAL ypn, REAL y2[]);
+void splint(REAL xa[], REAL ya[], REAL y2a[], int n, REAL x, REAL *y);
+void spread(REAL y, REAL yy[], unsigned long n, REAL x, int m);
+void sprsax(REAL sa[], unsigned long ija[], REAL x[], REAL b[],
+	unsigned long n);
+void sprsin(REAL **a, int n, REAL thresh, unsigned long nmax, REAL sa[],
+	unsigned long ija[]);
+void sprspm(REAL sa[], unsigned long ija[], REAL sb[], unsigned long ijb[],
+	REAL sc[], unsigned long ijc[]);
+void sprstm(REAL sa[], unsigned long ija[], REAL sb[], unsigned long ijb[],
+	REAL thresh, unsigned long nmax, REAL sc[], unsigned long ijc[]);
+void sprstp(REAL sa[], unsigned long ija[], REAL sb[], unsigned long ijb[]);
+void sprstx(REAL sa[], unsigned long ija[], REAL x[], REAL b[],
+	unsigned long n);
+void stifbs(REAL y[], REAL dydx[], int nv, REAL *xx,
+	REAL htry, REAL eps, REAL yscal[], REAL *hdid, REAL *hnext,
+	void (*derivs)(REAL, REAL [], REAL []));
+void stiff(REAL y[], REAL dydx[], int n, REAL *x,
+	REAL htry, REAL eps, REAL yscal[], REAL *hdid, REAL *hnext,
+	void (*derivs)(REAL, REAL [], REAL []));
+void stoerm(REAL y[], REAL d2y[], int nv, REAL xs,
+	REAL htot, int nstep, REAL yout[],
+	void (*derivs)(REAL, REAL [], REAL []));
+void svbksb(REAL **u, REAL w[], REAL **v, int m, int n, REAL b[],
+	REAL x[]);
+void svdcmp(REAL **a, int m, int n, REAL w[], REAL **v);
+void svdfit(REAL x[], REAL y[], REAL sig[], int ndata, REAL a[],
+	int ma, REAL **u, REAL **v, REAL w[], REAL *chisq,
+	void (*funcs)(REAL, REAL [], int));
+void svdvar(REAL **v, int ma, REAL w[], REAL **cvm);
+void toeplz(REAL r[], REAL x[], REAL y[], int n);
+void tptest(REAL data1[], REAL data2[], unsigned long n, REAL *t, REAL *prob);
+void tqli(REAL d[], REAL e[], int n, REAL **z);
+REAL trapzd(REAL (*func)(REAL), REAL a, REAL b, int n);
+void tred2(REAL **a, int n, REAL d[], REAL e[]);
+void tridag(REAL a[], REAL b[], REAL c[], REAL r[], REAL u[],
+	unsigned long n);
+REAL trncst(REAL x[], REAL y[], int iorder[], int ncity, int n[]);
+void trnspt(int iorder[], int ncity, int n[]);
+void ttest(REAL data1[], unsigned long n1, REAL data2[], unsigned long n2,
+	REAL *t, REAL *prob);
+void tutest(REAL data1[], unsigned long n1, REAL data2[], unsigned long n2,
+	REAL *t, REAL *prob);
+void twofft(REAL data1[], REAL data2[], REAL fft1[], REAL fft2[],
+	unsigned long n);
+void vander(double x[], double w[], double q[], int n);
+void vegas(REAL regn[], int ndim, REAL (*fxn)(REAL [], REAL), int init,
+	unsigned long ncall, int itmx, int nprn, REAL *tgral, REAL *sd,
+	REAL *chi2a);
+void voltra(int n, int m, REAL t0, REAL h, REAL *t, REAL **f,
+	REAL (*g)(int, REAL), REAL (*ak)(int, int, REAL, REAL));
+void wt1(REAL a[], unsigned long n, int isign,
+	void (*wtstep)(REAL [], unsigned long, int));
+void wtn(REAL a[], unsigned long nn[], int ndim, int isign,
+	void (*wtstep)(REAL [], unsigned long, int));
+void wwghts(REAL wghts[], int n, REAL h,
+	void (*kermom)(double [], double ,int));
+int zbrac(REAL (*func)(REAL), REAL *x1, REAL *x2);
+void zbrak(REAL (*fx)(REAL), REAL x1, REAL x2, int n, REAL xb1[],
+	REAL xb2[], int *nb);
+REAL zbrent(REAL (*func)(REAL), REAL x1, REAL x2, REAL tol);
+void zrhqr(REAL a[], int m, REAL rtr[], REAL rti[]);
+REAL zriddr(REAL (*func)(REAL), REAL x1, REAL x2, REAL xacc);
+void zroots(fcomplex a[], int m, fcomplex roots[], int polish);
+
+#endif /* _NR_H_ */

+ 295 - 0
examples/pastix-wrappers/models/num_recipes/nrutil.c

@@ -0,0 +1,295 @@
+/* CAUTION: This is the ANSI C (only) version of the Numerical Recipes
+   utility file nrutil.c.  Do not confuse this file with the same-named
+   file nrutil.c that is supplied in the 'misc' subdirectory.
+   *That* file is the one from the book, and contains both ANSI and
+   traditional K&R versions, along with #ifdef macros to select the
+   correct version.  *This* file contains only ANSI C.               */
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#define NR_END 1	/* spare elements allocated in front of every block; the allocators add it to the size and to the returned pointer, the free_* routines subtract it again */
+#define FREE_ARG char*	/* type every pointer is cast to before it is handed to free() */
+#define REAL float	/* element type of vector(), matrix(), submatrix(), convert_matrix() and f3tensor() */
+
+
+void nrerror(char error_text[])
+/* Numerical Recipes standard error handler: writes a three-line report
+   containing error_text to stderr and then terminates the process with
+   exit status 1, so it never returns to its caller. */
+{
+	fprintf(stderr,"Numerical Recipes run-time error...\n");
+	fprintf(stderr,"%s\n",error_text);
+	fprintf(stderr,"...now exiting to system...\n");
+	exit(1);
+}
+
+REAL *vector(long nl, long nh)
+/* allocate a REAL vector with subscript range v[nl..nh].
+   The malloc'd block holds nh-nl+1 elements plus NR_END spare elements in
+   front; the returned pointer is shifted so that index nl addresses the
+   first element after the spare ones.  Contents are uninitialized.
+   On allocation failure nrerror() is called, which exits.
+   Release with free_vector() using the same nl. */
+{
+	REAL *v;
+
+	v=(REAL *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(REAL)));
+	if (!v) nrerror("allocation failure in vector()");
+	return v-nl+NR_END;
+}
+
+int *ivector(long nl, long nh)
+/* allocate an int vector with subscript range v[nl..nh].
+   Same layout as vector(): nh-nl+1 elements plus NR_END spare elements in
+   front, pointer shifted so that index nl is the first usable element.
+   Contents are uninitialized.  On allocation failure nrerror() is called,
+   which exits.  Release with free_ivector() using the same nl. */
+{
+	int *v;
+
+	v=(int *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(int)));
+	if (!v) nrerror("allocation failure in ivector()");
+	return v-nl+NR_END;
+}
+
+unsigned char *cvector(long nl, long nh)
+/* allocate an unsigned char vector with subscript range v[nl..nh].
+   Same layout as vector(): nh-nl+1 elements plus NR_END spare elements in
+   front, pointer shifted so that index nl is the first usable element.
+   Contents are uninitialized.  On allocation failure nrerror() is called,
+   which exits.  Release with free_cvector() using the same nl. */
+{
+	unsigned char *v;
+
+	v=(unsigned char *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(unsigned char)));
+	if (!v) nrerror("allocation failure in cvector()");
+	return v-nl+NR_END;
+}
+
+unsigned long *lvector(long nl, long nh)
+/* allocate an unsigned long vector with subscript range v[nl..nh].
+   Same layout as vector(): nh-nl+1 elements plus NR_END spare elements in
+   front, pointer shifted so that index nl is the first usable element.
+   Contents are uninitialized.  On allocation failure nrerror() is called,
+   which exits.  Release with free_lvector() using the same nl. */
+{
+	unsigned long *v;
+
+	v=(unsigned long *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(long)));	/* sized with sizeof(long); long and unsigned long have the same size */
+	if (!v) nrerror("allocation failure in lvector()");
+	return v-nl+NR_END;
+}
+
+double *dvector(long nl, long nh)
+/* allocate a double vector with subscript range v[nl..nh].
+   Same layout as vector(): nh-nl+1 elements plus NR_END spare elements in
+   front, pointer shifted so that index nl is the first usable element.
+   Contents are uninitialized.  On allocation failure nrerror() is called,
+   which exits.  Release with free_dvector() using the same nl. */
+{
+	double *v;
+
+	v=(double *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(double)));
+	if (!v) nrerror("allocation failure in dvector()");
+	return v-nl+NR_END;
+}
+
+REAL **matrix(long nrl, long nrh, long ncl, long nch)
+/* allocate a REAL matrix with subscript range m[nrl..nrh][ncl..nch].
+   Two malloc calls are made: one for the array of row pointers and one
+   for a single contiguous block of nrow*ncol elements (each with NR_END
+   spare entries in front).  Rows follow each other in that block, ncol
+   elements apart.  Elements are uninitialized.  Either allocation failing
+   calls nrerror(), which exits.  Release with free_matrix() using the
+   same nrl and ncl. */
+{
+	long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;
+	REAL **m;
+
+	/* allocate pointers to rows */
+	m=(REAL **) malloc((size_t)((nrow+NR_END)*sizeof(REAL*)));
+	if (!m) nrerror("allocation failure 1 in matrix()");
+	m += NR_END;
+	m -= nrl;	/* from here on m[nrl] is the first row pointer */
+
+	/* allocate rows and set pointers to them */
+	m[nrl]=(REAL *) malloc((size_t)((nrow*ncol+NR_END)*sizeof(REAL)));
+	if (!m[nrl]) nrerror("allocation failure 2 in matrix()");
+	m[nrl] += NR_END;
+	m[nrl] -= ncl;	/* from here on m[nrl][ncl] is the first element */
+
+	for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;	/* each row starts ncol elements after the previous one */
+
+	/* return pointer to array of pointers to rows */
+	return m;
+}
+
+double **dmatrix(long nrl, long nrh, long ncl, long nch)
+/* allocate a double matrix with subscript range m[nrl..nrh][ncl..nch].
+   Same scheme as matrix(): one malloc for the row pointers, one for a
+   contiguous block of nrow*ncol elements, each with NR_END spare entries
+   in front.  Elements are uninitialized.  Either allocation failing calls
+   nrerror(), which exits.  Release with free_dmatrix() using the same nrl
+   and ncl.
+   NOTE(review): both failure messages name "matrix()" rather than
+   "dmatrix()". */
+{
+	long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;
+	double **m;
+
+	/* allocate pointers to rows */
+	m=(double **) malloc((size_t)((nrow+NR_END)*sizeof(double*)));
+	if (!m) nrerror("allocation failure 1 in matrix()");
+	m += NR_END;
+	m -= nrl;	/* from here on m[nrl] is the first row pointer */
+
+	/* allocate rows and set pointers to them */
+	m[nrl]=(double *) malloc((size_t)((nrow*ncol+NR_END)*sizeof(double)));
+	if (!m[nrl]) nrerror("allocation failure 2 in matrix()");
+	m[nrl] += NR_END;
+	m[nrl] -= ncl;	/* from here on m[nrl][ncl] is the first element */
+
+	for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;	/* each row starts ncol elements after the previous one */
+
+	/* return pointer to array of pointers to rows */
+	return m;
+}
+
+int **imatrix(long nrl, long nrh, long ncl, long nch)
+/* allocate a int matrix with subscript range m[nrl..nrh][ncl..nch].
+   Same scheme as matrix(): one malloc for the row pointers, one for a
+   contiguous block of nrow*ncol elements, each with NR_END spare entries
+   in front.  Elements are uninitialized.  Either allocation failing calls
+   nrerror(), which exits.  Release with free_imatrix() using the same nrl
+   and ncl.
+   NOTE(review): both failure messages name "matrix()" rather than
+   "imatrix()". */
+{
+	long i, nrow=nrh-nrl+1,ncol=nch-ncl+1;
+	int **m;
+
+	/* allocate pointers to rows */
+	m=(int **) malloc((size_t)((nrow+NR_END)*sizeof(int*)));
+	if (!m) nrerror("allocation failure 1 in matrix()");
+	m += NR_END;
+	m -= nrl;	/* from here on m[nrl] is the first row pointer */
+
+
+	/* allocate rows and set pointers to them */
+	m[nrl]=(int *) malloc((size_t)((nrow*ncol+NR_END)*sizeof(int)));
+	if (!m[nrl]) nrerror("allocation failure 2 in matrix()");
+	m[nrl] += NR_END;
+	m[nrl] -= ncl;	/* from here on m[nrl][ncl] is the first element */
+
+	for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;	/* each row starts ncol elements after the previous one */
+
+	/* return pointer to array of pointers to rows */
+	return m;
+}
+
+REAL **submatrix(REAL **a, long oldrl, long oldrh, long oldcl, long oldch,
+	long newrl, long newcl)
+/* point a submatrix [newrl..][newcl..] to a[oldrl..oldrh][oldcl..oldch].
+   Only a new array of row pointers is allocated; the elements are shared
+   with a, so writes through the result modify a.  Row newrl+k of the
+   result aliases row oldrl+k of a, and column newcl aliases column oldcl.
+   oldch is not used by the body.  Allocation failure calls nrerror(),
+   which exits.  Release with free_submatrix(), passing newrl as its nrl. */
+{
+	long i,j,nrow=oldrh-oldrl+1,ncol=oldcl-newcl;	/* note: ncol here is a column offset, not a column count */
+	REAL **m;
+
+	/* allocate array of pointers to rows */
+	m=(REAL **) malloc((size_t) ((nrow+NR_END)*sizeof(REAL*)));
+	if (!m) nrerror("allocation failure in submatrix()");
+	m += NR_END;
+	m -= newrl;
+
+	/* set pointers to rows */
+	for(i=oldrl,j=newrl;i<=oldrh;i++,j++) m[j]=a[i]+ncol;
+
+	/* return pointer to array of pointers to rows */
+	return m;
+}
+
+REAL **convert_matrix(REAL *a, long nrl, long nrh, long ncl, long nch)
+/* allocate a REAL matrix m[nrl..nrh][ncl..nch] that points to the matrix
+declared in the standard C manner as a[nrow][ncol], where nrow=nrh-nrl+1
+and ncol=nch-ncl+1. The routine should be called with the address
+&a[0][0] as the first argument.
+Only the array of row pointers is allocated; the elements stay in the
+caller's storage a.  Allocation failure calls nrerror(), which exits.
+Release the row pointers with free_convert_matrix() using the same nrl. */
+{
+	long i,j,nrow=nrh-nrl+1,ncol=nch-ncl+1;
+	REAL **m;
+
+	/* allocate pointers to rows */
+	m=(REAL **) malloc((size_t) ((nrow+NR_END)*sizeof(REAL*)));
+	if (!m) nrerror("allocation failure in convert_matrix()");
+	m += NR_END;
+	m -= nrl;
+
+	/* set pointers to rows */
+	m[nrl]=a-ncl;	/* so that m[nrl][ncl] is a[0] */
+	for(i=1,j=nrl+1;i<nrow;i++,j++) m[j]=m[j-1]+ncol;
+	/* return pointer to array of pointers to rows */
+	return m;
+}
+
+REAL ***f3tensor(long nrl, long nrh, long ncl, long nch, long ndl, long ndh)
+/* allocate a REAL 3tensor with range t[nrl..nrh][ncl..nch][ndl..ndh].
+   Three malloc calls are made: the array of nrow pointers, one block of
+   nrow*ncol row pointers, and one contiguous block of nrow*ncol*ndep
+   elements (each with NR_END spare entries in front).  Elements are
+   uninitialized.  Any allocation failing calls nrerror(), which exits.
+   Release with free_f3tensor() using the same nrl, ncl and ndl. */
+{
+	long i,j,nrow=nrh-nrl+1,ncol=nch-ncl+1,ndep=ndh-ndl+1;
+	REAL ***t;
+
+	/* allocate pointers to pointers to rows */
+	t=(REAL ***) malloc((size_t)((nrow+NR_END)*sizeof(REAL**)));
+	if (!t) nrerror("allocation failure 1 in f3tensor()");
+	t += NR_END;
+	t -= nrl;
+
+	/* allocate pointers to rows and set pointers to them */
+	t[nrl]=(REAL **) malloc((size_t)((nrow*ncol+NR_END)*sizeof(REAL*)));
+	if (!t[nrl]) nrerror("allocation failure 2 in f3tensor()");
+	t[nrl] += NR_END;
+	t[nrl] -= ncl;
+
+	/* allocate rows and set pointers to them */
+	t[nrl][ncl]=(REAL *) malloc((size_t)((nrow*ncol*ndep+NR_END)*sizeof(REAL)));
+	if (!t[nrl][ncl]) nrerror("allocation failure 3 in f3tensor()");
+	t[nrl][ncl] += NR_END;
+	t[nrl][ncl] -= ndl;
+
+	for(j=ncl+1;j<=nch;j++) t[nrl][j]=t[nrl][j-1]+ndep;	/* remaining rows of the first slab */
+	for(i=nrl+1;i<=nrh;i++) {
+		t[i]=t[i-1]+ncol;	/* next group of ncol row pointers */
+		t[i][ncl]=t[i-1][ncl]+ncol*ndep;	/* next slab of ncol*ndep elements */
+		for(j=ncl+1;j<=nch;j++) t[i][j]=t[i][j-1]+ndep;
+	}
+
+	/* return pointer to array of pointers to rows */
+	return t;
+}
+
+void free_vector(REAL *v, long nl, long nh)
+/* free a REAL vector allocated with vector().
+   nl must be the lower bound that was given to vector(), because it is
+   used to recover the address malloc returned; nh is not used. */
+{
+	free((FREE_ARG) (v+nl-NR_END));
+}
+
+void free_ivector(int *v, long nl, long nh)
+/* free an int vector allocated with ivector().
+   nl must be the lower bound that was given to ivector(), because it is
+   used to recover the address malloc returned; nh is not used. */
+{
+	free((FREE_ARG) (v+nl-NR_END));
+}
+
+void free_cvector(unsigned char *v, long nl, long nh)
+/* free an unsigned char vector allocated with cvector().
+   nl must be the lower bound that was given to cvector(), because it is
+   used to recover the address malloc returned; nh is not used. */
+{
+	free((FREE_ARG) (v+nl-NR_END));
+}
+
+void free_lvector(unsigned long *v, long nl, long nh)
+/* free an unsigned long vector allocated with lvector().
+   nl must be the lower bound that was given to lvector(), because it is
+   used to recover the address malloc returned; nh is not used. */
+{
+	free((FREE_ARG) (v+nl-NR_END));
+}
+
+void free_dvector(double *v, long nl, long nh)
+/* free a double vector allocated with dvector().
+   nl must be the lower bound that was given to dvector(), because it is
+   used to recover the address malloc returned; nh is not used. */
+{
+	free((FREE_ARG) (v+nl-NR_END));
+}
+
+void free_matrix(REAL **m, long nrl, long nrh, long ncl, long nch)
+/* free a REAL matrix allocated by matrix().
+   nrl and ncl must be the lower bounds that were given to matrix(); they
+   are used to recover the two addresses malloc returned.  nrh and nch are
+   not used. */
+{
+	free((FREE_ARG) (m[nrl]+ncl-NR_END));	/* element block first: it is reached through m[nrl] */
+	free((FREE_ARG) (m+nrl-NR_END));	/* then the row-pointer array */
+}
+
+void free_dmatrix(double **m, long nrl, long nrh, long ncl, long nch)
+/* free a double matrix allocated by dmatrix().
+   nrl and ncl must be the lower bounds that were given to dmatrix(); they
+   are used to recover the two addresses malloc returned.  nrh and nch are
+   not used. */
+{
+	free((FREE_ARG) (m[nrl]+ncl-NR_END));	/* element block first: it is reached through m[nrl] */
+	free((FREE_ARG) (m+nrl-NR_END));	/* then the row-pointer array */
+}
+
+void free_imatrix(int **m, long nrl, long nrh, long ncl, long nch)
+/* free an int matrix allocated by imatrix().
+   nrl and ncl must be the lower bounds that were given to imatrix(); they
+   are used to recover the two addresses malloc returned.  nrh and nch are
+   not used. */
+{
+	free((FREE_ARG) (m[nrl]+ncl-NR_END));	/* element block first: it is reached through m[nrl] */
+	free((FREE_ARG) (m+nrl-NR_END));	/* then the row-pointer array */
+}
+
+void free_submatrix(REAL **b, long nrl, long nrh, long ncl, long nch)
+/* free a submatrix allocated by submatrix().
+   Only the row-pointer array is released; the elements are left alone.
+   nrl must be the newrl value that was given to submatrix(); nrh, ncl and
+   nch are not used. */
+{
+	free((FREE_ARG) (b+nrl-NR_END));
+}
+
+void free_convert_matrix(REAL **b, long nrl, long nrh, long ncl, long nch)
+/* free a matrix allocated by convert_matrix().
+   Only the row-pointer array is released; the elements are left alone.
+   nrl must be the lower row bound that was given to convert_matrix();
+   nrh, ncl and nch are not used. */
+{
+	free((FREE_ARG) (b+nrl-NR_END));
+}
+
+void free_f3tensor(REAL ***t, long nrl, long nrh, long ncl, long nch,
+	long ndl, long ndh)
+/* free a REAL f3tensor allocated by f3tensor().
+   nrl, ncl and ndl must be the lower bounds that were given to
+   f3tensor(); they are used to recover the three addresses malloc
+   returned.  nrh, nch and ndh are not used. */
+{
+	free((FREE_ARG) (t[nrl][ncl]+ndl-NR_END));	/* innermost block first: the outer ones are needed to reach it */
+	free((FREE_ARG) (t[nrl]+ncl-NR_END));
+	free((FREE_ARG) (t+nrl-NR_END));
+}

+ 79 - 0
examples/pastix-wrappers/models/num_recipes/nrutil.h

@@ -0,0 +1,79 @@
+/* CAUTION: This is the ANSI C (only) version of the Numerical Recipes
+   utility file nrutil.h.  Do not confuse this file with the same-named
+   file nrutil.h that is supplied in the 'misc' subdirectory.
+   *That* file is the one from the book, and contains both ANSI and
+   traditional K&R versions, along with #ifdef macros to select the
+   correct version.  *This* file contains only ANSI C.               */
+
+#ifndef _NR_UTILS_H_
+#define _NR_UTILS_H_
+
+/* Scalar type used by the single-precision routines declared below. */
+#define REAL float
+
+/*
+ * Square / min / max helper macros.
+ *
+ * Each macro first copies its argument(s) into file-scope static temporaries
+ * and then works on the copies, so every argument expression is evaluated
+ * exactly once.  Because the temporaries are static, every translation unit
+ * that includes this header gets its own copy of them.
+ *
+ * The temporaries are shared by all uses of a given macro: using a macro
+ * inside its own SECOND argument (e.g. FMAX(a, FMAX(b, c))) overwrites the
+ * first temporary before the outer comparison is made.
+ */
+static REAL sqrarg;
+#define SQR(a) ((sqrarg=(a)) == 0.0 ? 0.0 : sqrarg*sqrarg)
+
+static double dsqrarg;
+#define DSQR(a) ((dsqrarg=(a)) == 0.0 ? 0.0 : dsqrarg*dsqrarg)
+
+static double dmaxarg1,dmaxarg2;
+#define DMAX(a,b) (dmaxarg1=(a),dmaxarg2=(b),(dmaxarg1) > (dmaxarg2) ?\
+        (dmaxarg1) : (dmaxarg2))
+
+static double dminarg1,dminarg2;
+#define DMIN(a,b) (dminarg1=(a),dminarg2=(b),(dminarg1) < (dminarg2) ?\
+        (dminarg1) : (dminarg2))
+
+static REAL maxarg1,maxarg2;
+#define FMAX(a,b) (maxarg1=(a),maxarg2=(b),(maxarg1) > (maxarg2) ?\
+        (maxarg1) : (maxarg2))
+
+static REAL minarg1,minarg2;
+#define FMIN(a,b) (minarg1=(a),minarg2=(b),(minarg1) < (minarg2) ?\
+        (minarg1) : (minarg2))
+
+static long lmaxarg1,lmaxarg2;
+#define LMAX(a,b) (lmaxarg1=(a),lmaxarg2=(b),(lmaxarg1) > (lmaxarg2) ?\
+        (lmaxarg1) : (lmaxarg2))
+
+static long lminarg1,lminarg2;
+#define LMIN(a,b) (lminarg1=(a),lminarg2=(b),(lminarg1) < (lminarg2) ?\
+        (lminarg1) : (lminarg2))
+
+static int imaxarg1,imaxarg2;
+#define IMAX(a,b) (imaxarg1=(a),imaxarg2=(b),(imaxarg1) > (imaxarg2) ?\
+        (imaxarg1) : (imaxarg2))
+
+static int iminarg1,iminarg2;
+#define IMIN(a,b) (iminarg1=(a),iminarg2=(b),(iminarg1) < (iminarg2) ?\
+        (iminarg1) : (iminarg2))
+
+/* |a| carrying the sign of b (b >= 0.0 yields +|a|).  Expands to fabs(), which
+   this header does not declare: the including file must pull in <math.h>. */
+#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
+
+/* Error reporting. */
+void nrerror(char error_text[]);
+/* Allocators: vectors take an index range [nl..nh], matrices take row and
+   column ranges [nrl..nrh] x [ncl..nch]. */
+REAL *vector(long nl, long nh);
+int *ivector(long nl, long nh);
+unsigned char *cvector(long nl, long nh);
+unsigned long *lvector(long nl, long nh);
+double *dvector(long nl, long nh);
+REAL **matrix(long nrl, long nrh, long ncl, long nch);
+double **dmatrix(long nrl, long nrh, long ncl, long nch);
+int **imatrix(long nrl, long nrh, long ncl, long nch);
+REAL **submatrix(REAL **a, long oldrl, long oldrh, long oldcl, long oldch,
+	long newrl, long newcl);
+REAL **convert_matrix(REAL *a, long nrl, long nrh, long ncl, long nch);
+REAL ***f3tensor(long nrl, long nrh, long ncl, long nch, long ndl, long ndh);
+/* Matching release functions: pass the same lower bounds that were given to
+   the allocator. */
+void free_vector(REAL *v, long nl, long nh);
+void free_ivector(int *v, long nl, long nh);
+void free_cvector(unsigned char *v, long nl, long nh);
+void free_lvector(unsigned long *v, long nl, long nh);
+void free_dvector(double *v, long nl, long nh);
+void free_matrix(REAL **m, long nrl, long nrh, long ncl, long nch);
+void free_dmatrix(double **m, long nrl, long nrh, long ncl, long nch);
+void free_imatrix(int **m, long nrl, long nrh, long ncl, long nch);
+void free_submatrix(REAL **b, long nrl, long nrh, long ncl, long nch);
+void free_convert_matrix(REAL **b, long nrl, long nrh, long ncl, long nch);
+void free_f3tensor(REAL ***t, long nrl, long nrh, long ncl, long nch,
+	long ndl, long ndh);
+
+#endif /* _NR_UTILS_H_ */

+ 141 - 0
examples/pastix-wrappers/models/reg_gemm.c

@@ -0,0 +1,141 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "nr.h"
+
+#define REAL float
+/* Number of elements in each sample table below. */
+#define SIZE 2000000
+/* Number of coefficients fitted by lfit() (a[1]..a[ma]). */
+#define ma 6
+
+
+/* Explanatory variables of one sample, as derived by main() from an input line. */
+typedef struct Coord
+{
+  REAL x1;
+  REAL x2;
+  REAL x3;
+} coord, *pcoord;
+
+
+/* Sample tables and fit state.  main() fills and reads them from index 1, so
+   index 0 of every table is left unused. */
+coord tabcoord[SIZE];	/* explanatory variables of sample k */
+REAL tabx[SIZE];	/* set to k by main(); funcs() uses it as the index into tabcoord */
+REAL taby[SIZE];  	/* measured value of sample k */
+REAL sig[SIZE];		/* set to 1 for every sample by main() */
+REAL afunc[ma+1];
+int ia[ma+1];		/* all set to 1 by main() before calling lfit() */
+REAL a[ma+1];		/* fitted coefficients, a[1]..a[ma] */
+
+
+void funcs( REAL i, REAL afunc[ma+1], int ma2)
+{
+  /* Basis functions of the GEMM model for the sample whose index is carried
+   * (as a REAL) in i.  Fills afunc[1..6]; ma2 is not used. */
+  const coord *sample = &tabcoord[(int)i];
+
+  afunc[1] = 1;
+  afunc[2] = sample->x1;
+  afunc[3] = sample->x2;
+  afunc[4] = sample->x1 * sample->x2;
+  afunc[5] = sample->x2 * sample->x3;
+  afunc[6] = sample->x1 * sample->x2 * sample->x3;
+}
+
+
+int main(int argc, char * argv[])
+{
+  /*
+   * Fit the 6-coefficient GEMM cost model to a file of measurements.
+   *
+   * Usage: <prog> <measurement file> <number of samples>
+   * Every input line holds "value x0 y0 x1 y1 x2 y2" (one float, six ints).
+   * The fitted coefficients are written to stdout as GEMM_A..GEMM_F defines;
+   * the PERF_GEMM formula, the summed absolute error and its mean go to stderr.
+   *
+   * Returns 0 on success and 1 on bad arguments or unreadable/short input.
+   */
+  REAL total=0.0;
+  REAL ecart=0.0;
+  REAL chisq=0.0;
+  /* lfit() takes the covariance matrix as an array of row pointers; ma is a
+     compile-time constant, so no heap allocation is needed. */
+  REAL covar_rows[ma+1][ma+1];
+  REAL *covar[ma+1];
+  char *filename;
+  char *end;
+  long nsamples;
+  int ndat;
+  int k;
+  FILE *res;
+
+  if (argc < 3)
+    {
+      fprintf(stderr, "usage: %s <measurement file> <number of samples>\n",
+	      (argc > 0) ? argv[0] : "reg_gemm");
+      return 1;
+    }
+  filename = argv[1];
+
+  /* Samples are stored at indices 1..ndat of SIZE-element tables, so at most
+     SIZE-1 of them fit. */
+  nsamples = strtol(argv[2], &end, 10);
+  if (end == argv[2] || *end != '\0' || nsamples < 1 || nsamples > SIZE-1)
+    {
+      fprintf(stderr, "invalid number of samples '%s' (expected 1..%d)\n",
+	      argv[2], SIZE-1);
+      return 1;
+    }
+  ndat = (int)nsamples;
+
+  res = fopen(filename,"r");
+  if (res == NULL)
+    {
+      perror(filename);
+      return 1;
+    }
+
+  for (k=0;k<ma+1;k++)
+    covar[k]=covar_rows[k];
+
+  for (k=1;k<ndat+1;k++)
+    {
+      int x0, y0, x1, y1, x2, y2;
+      REAL tmpfloat;
+      if (fscanf(res,"%f\t%d\t%d\t%d\t%d\t%d\t%d\n", &tmpfloat, &x0, &y0, &x1, &y1, &x2, &y2) != 7)
+	{
+	  fprintf(stderr, "%s: missing or malformed sample %d\n", filename, k);
+	  fclose(res);
+	  return 1;
+	}
+      tabcoord[k].x1= x0 - y0; 
+      tabcoord[k].x2= y2;
+      tabcoord[k].x3= y0;
+      taby[k]=tmpfloat;
+      sig[k]=1;
+      /* funcs() receives tabx[k] and uses it as the sample index */
+      tabx[k]=k;
+    }
+  fclose(res);
+
+  /* fit every coefficient */
+  for (k=1;k<ma+1;k++)
+    ia[k]=1;
+
+  lfit(tabx, taby, sig, ndat, a, ia, ma, covar, &chisq, &funcs);
+
+  /* accumulate the absolute error of the fitted model over all samples */
+  for (k=1;k<ndat+1;k++)
+    {
+      double err=0.0;
+      err += a[1];
+      err += tabcoord[k].x1*a[2]+tabcoord[k].x2*a[3];
+      err += tabcoord[k].x1*tabcoord[k].x2*a[4]+tabcoord[k].x2*tabcoord[k].x3*a[5];
+      err += tabcoord[k].x1*tabcoord[k].x2*tabcoord[k].x3*a[6];
+      err = err - taby[k];
+      if (err < 0)
+	err = - err;
+
+      total += err;
+    }
+
+  fprintf(stdout,"#define GEMM_A  %e\n#define GEMM_B  %e\n#define GEMM_C  %e\n#define GEMM_D  %e\n#define GEMM_E  %e\n#define GEMM_F  %e\n",a[6],a[4],a[5],a[2],a[3],a[1]);
+  fprintf(stderr,"#define PERF_GEMM(i,j,k) (GEMM_A*(double)(i)*(double)(j)*(double)(k)+GEMM_B*(double)(i)*(double)(j)+GEMM_C*(double)(j)*(double)(k)+GEMM_D*(double)(i)+GEMM_E*(double)(j)+GEMM_F)\n");
+
+
+  fprintf(stderr, "total %lf\n", total);
+  ecart = total / ndat;  
+  fprintf(stderr, "ecart moyen %lf\n", ecart);
+
+  return 0;
+}

+ 122 - 0
examples/pastix-wrappers/models/reg_trsm.c

@@ -0,0 +1,122 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "nr.h"
+
+/* Number of elements in each sample table below. */
+#define SIZE 100000
+/* Number of coefficients fitted by lfit() (a[1]..a[ma]). */
+#define ma 3
+
+
+/* Explanatory variables of one sample, as derived by main() from an input line. */
+typedef struct Coord
+{
+  float x1;
+  float x2;
+} coord, *pcoord;
+
+
+/* Sample tables and fit state.  main() fills and reads them from index 1, so
+   index 0 of every table is left unused. */
+coord tabcoord[SIZE];	/* explanatory variables of sample k */
+float tabx[SIZE];	/* set to k by main(); funcs() uses it as the index into tabcoord */
+float taby[SIZE];  	/* measured value of sample k */
+float sig[SIZE];	/* set to 1 for every sample by main() */
+float afunc[ma+1];
+int ia[ma+1];		/* all set to 1 by main() before calling lfit() */
+float a[ma+1];		/* fitted coefficients, a[1]..a[ma] */
+
+
+void funcs( float i, float afunc[ma+1], int ma2)
+{
+  /* Basis functions of the TRSM model for the sample whose index is carried
+   * (as a float) in i.  Fills afunc[1..3]; ma2 is not used. */
+  const coord *sample = &tabcoord[(int)i];
+
+  afunc[1] = 1;
+  afunc[2] = sample->x1;
+  afunc[3] = sample->x1 * sample->x1 * sample->x2;
+}
+
+
+int main(int argc, char * argv[])
+{
+  /*
+   * Fit the 3-coefficient TRSM cost model to a file of measurements.
+   *
+   * Usage: <prog> <measurement file> <number of samples>
+   * Every input line holds "value i j" (one float, two ints).
+   * The fitted coefficients are written to stdout as TRSM_A..TRSM_C defines;
+   * the PERF_TRSM formula, the summed absolute error and its mean go to stderr.
+   *
+   * Returns 0 on success and 1 on bad arguments or unreadable/short input.
+   */
+  float total=0.0;
+  float ecart=0.0;
+  float chisq=0.0;
+  /* lfit() takes the covariance matrix as an array of row pointers; ma is a
+     compile-time constant, so no heap allocation is needed. */
+  float covar_rows[ma+1][ma+1];
+  float *covar[ma+1];
+  char *filename;
+  char *end;
+  long nsamples;
+  int ndat;
+  int k;
+  FILE *res;
+
+  if (argc < 3)
+    {
+      fprintf(stderr, "usage: %s <measurement file> <number of samples>\n",
+	      (argc > 0) ? argv[0] : "reg_trsm");
+      return 1;
+    }
+  filename = argv[1];
+
+  /* Samples are stored at indices 1..ndat of SIZE-element tables, so at most
+     SIZE-1 of them fit. */
+  nsamples = strtol(argv[2], &end, 10);
+  if (end == argv[2] || *end != '\0' || nsamples < 1 || nsamples > SIZE-1)
+    {
+      fprintf(stderr, "invalid number of samples '%s' (expected 1..%d)\n",
+	      argv[2], SIZE-1);
+      return 1;
+    }
+  ndat = (int)nsamples;
+
+  res = fopen(filename,"r");
+  if (res == NULL)
+    {
+      perror(filename);
+      return 1;
+    }
+
+  for (k=0;k<ma+1;k++)
+    covar[k]=covar_rows[k];
+
+  for (k=1;k<ndat+1;k++)
+    {
+      int row, col;
+      float tmpfloat;
+      if (fscanf(res,"%f\t%d\t%d\n", &tmpfloat, &row, &col) != 3)
+	{
+	  fprintf(stderr, "%s: missing or malformed sample %d\n", filename, k);
+	  fclose(res);
+	  return 1;
+	}
+      tabcoord[k].x1=row-col; 
+      tabcoord[k].x2=col;
+      taby[k]=tmpfloat;
+      sig[k]=1;
+      /* funcs() receives tabx[k] and uses it as the sample index */
+      tabx[k]=k;
+    }
+  fclose(res);
+
+  /* fit every coefficient */
+  for (k=1;k<ma+1;k++)
+    ia[k]=1;
+  
+  lfit(tabx, taby, sig, ndat, a, ia, ma, covar, &chisq, &funcs);
+
+  /* accumulate the absolute error of the fitted model over all samples */
+  for (k=1;k<ndat+1;k++)
+    {
+      double err=0.0;
+      err += a[1];
+      err += tabcoord[k].x1*a[2];
+      err += tabcoord[k].x1*tabcoord[k].x1*tabcoord[k].x2*a[3];
+      err = err - taby[k];
+      if (err < 0)
+	err = - err;
+
+      total += err;
+    }
+  
+  fprintf(stdout,"#define TRSM_A %e\n#define TRSM_B %e\n#define TRSM_C %e\n", a[3], a[2], a[1]);
+  fprintf(stderr,"#define PERF_TRSM(i,j)   (TRSM_A*(double)(i)*(double)(i)*(double)(j)+TRSM_B*(double)(i)+TRSM_C)\n");
+
+
+  fprintf(stderr, "total %lf\n", total);
+  ecart = total / ndat;  
+  fprintf(stderr, "ecart moyen %lf\n", ecart);
+
+  return 0;
+}

+ 755 - 0
examples/pastix-wrappers/starpu-blas-wrapper.c

@@ -0,0 +1,755 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <semaphore.h>
+#include <core/jobs.h>
+#include <core/workers.h>
+#include <core/dependencies/tags.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <ctype.h>
+#include <pthread.h>
+#include <signal.h>
+#include <cblas.h>
+
+#include <datawizard/datawizard.h>
+#include <task-models/blas_model.h>
+#include <common/fxt.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+#define BLOCK	75
+
+#include "starpu-blas-wrapper.h"
+
+extern struct data_interface_ops_t interface_blas_ops;
+
+/* Number of times each kernel body ran, per backend; printed by
+ * STARPU_TERMINATE().  The counters are plain ints incremented without any
+ * synchronisation. */
+static int core_sgemm = 0;
+static int cublas_sgemm = 0;
+static int core_strsm = 0;
+static int cublas_strsm = 0;
+
+/* Non-zero once STARPU_INIT() has called starpu_init(). */
+static int inited = 0;
+
+void STARPU_INIT(void)
+{
+	/* Start the StarPU runtime on the first call only; later calls return
+	 * immediately. */
+	if (inited)
+		return;
+
+	inited = 1;
+	starpu_init();
+}
+
+void STARPU_TERMINATE(void)
+{
+	/* Shut the StarPU runtime down and report on stderr how many SGEMM and
+	 * STRSM kernels ran on each backend. */
+	starpu_shutdown();
+
+	/* Clear the guard so that a later STARPU_INIT() really restarts the
+	 * runtime instead of returning early. */
+	inited = 0;
+
+	fprintf(stderr, "sgemm : core %d cublas %d\n", core_sgemm, cublas_sgemm);
+	fprintf(stderr, "strsm : core %d cublas %d\n", core_strsm, cublas_strsm);
+}
+
+/*
+ *
+ *	Specific to PaStiX !
+ *
+ */
+
+/*
+ *	
+ *	We "need" some custom filters
+ *
+ *			 VECTOR
+ *			  (n)
+ *			/  |   \		
+ * 		   VECTOR  BLAS  VECTOR
+ * 		    (n1)  (n2)	 
+ *	
+ *	if n1 = 0 :
+ * 			VECTOR
+ *			/   \
+ *		     BLAS  VECTOR
+ */
+
+/* Arguments read by divide_vector_in_blas_filter() through filter_arg_ptr. */
+struct divide_vector_in_blas_filter_args {
+	uint32_t n1, n2; /* n1: elements in the leading vector child (0 = no such child); n2: elements in the BLAS child; the filter asserts n1 + n2 < root's nx */
+	uint32_t stride; /* nx and ld of the BLAS child; the filter asserts that n2 is a multiple of it */
+};
+
+unsigned divide_vector_in_blas_filter(starpu_filter *f, starpu_data_handle root_data)
+{
+	/*
+	 * Split the vector `root_data` into [vector(n1)] + blas(n2) + vector(rest).
+	 * The leading vector child is only created when n1 != 0.
+	 *
+	 * f->filter_arg_ptr must point to a struct divide_vector_in_blas_filter_args.
+	 * Returns the number of children created (2 when n1 == 0, 3 otherwise).
+	 */
+	starpu_vector_interface_t *vector_root = &root_data->interface[0].vector;
+	uint32_t nx = vector_root->nx;
+	size_t elemsize = vector_root->elemsize;
+
+	struct divide_vector_in_blas_filter_args *args = f->filter_arg_ptr;
+	unsigned n1 = args->n1;
+	unsigned n2 = args->n2;
+	unsigned stride = args->stride;
+
+	/* Validate the arguments before doing any arithmetic with them: a zero
+	 * stride would otherwise be used as a divisor below. */
+	STARPU_ASSERT(stride > 0);
+	STARPU_ASSERT(n1 + n2 < nx);
+	STARPU_ASSERT((n2 % stride) == 0);
+
+	unsigned n3 = nx - n1 - n2;
+	unsigned nchildren = (n1 == 0) ? 2 : 3;
+
+	/* The children are indexed below as an array of data-state structures,
+	 * so size the array by its element type rather than by the (pointer)
+	 * handle type. */
+	root_data->children = calloc(nchildren, sizeof(*root_data->children));
+	STARPU_ASSERT(root_data->children);
+
+	unsigned child = 0;
+	unsigned node;
+
+	/* optional leading vector child: the first n1 elements */
+	if (n1 > 0)
+	{
+		for (node = 0; node < MAXNODES; node++)
+		{
+			starpu_vector_interface_t *local = &root_data->children[child].interface[node].vector;
+
+			local->nx = n1;
+			local->elemsize = elemsize;
+
+			if (root_data->per_node[node].allocated) {
+				local->ptr = root_data->interface[node].vector.ptr;
+			}
+		}
+
+		child++;
+	}
+
+	/* BLAS child: the next n2 elements seen as a stride x (n2/stride) matrix */
+	struct starpu_data_state_t *state = &root_data->children[child];
+	state->ops = &interface_blas_ops;
+
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_blas_interface_t *local = &root_data->children[child].interface[node].blas;
+
+		local->nx = stride;
+		local->ny = n2/stride;
+		local->ld = stride;
+		local->elemsize = elemsize;
+
+		if (root_data->per_node[node].allocated) {
+			local->ptr = root_data->interface[node].vector.ptr + n1*elemsize;
+		}
+	}
+
+	child++;
+
+	/* trailing vector child: the remaining n3 elements */
+	for (node = 0; node < MAXNODES; node++)
+	{
+		starpu_vector_interface_t *local = &root_data->children[child].interface[node].vector;
+
+		local->nx = n3;
+		local->elemsize = elemsize;
+
+		if (root_data->per_node[node].allocated) {
+			local->ptr = root_data->interface[node].vector.ptr + (n1+n2)*elemsize;
+		}
+	}
+
+	return nchildren;
+}
+
+
+static data_state *cblktab;
+
+static void _cublas_cblk_strsm_callback(void *sem)
+{
+	/* Job completion callback: `sem` is the sem_t the submitter is waiting
+	 * on; post it. */
+	sem_post((sem_t *)sem);
+}
+
+
+void STARPU_MONITOR_DATA(unsigned ncols)
+{
+	/* Allocate the zero-filled table holding one data state per column
+	 * block; entries are registered later by STARPU_MONITOR_CBLK(). */
+	cblktab = calloc(ncols, sizeof(*cblktab));
+
+	/* calloc() may legitimately return NULL for an empty table */
+	STARPU_ASSERT(cblktab || ncols == 0);
+}
+
+void STARPU_MONITOR_CBLK(unsigned col, float *data, unsigned stride, unsigned width)
+{
+	/* Register column block `col` with the runtime as a float BLAS matrix
+	 * living at `data` on node 0.  `stride` is passed for two consecutive
+	 * arguments and `width` for the next one (ld, nx and ny according to the
+	 * prototype quoted in the original comment). */
+	data_state *cblk = &cblktab[col];
+	uintptr_t base = (uintptr_t) data;
+
+	starpu_monitor_blas_data(cblk, 0 /* home */,
+			base, stride, stride, width, sizeof(float));
+}
+
+static data_state work_block_1;
+static data_state work_block_2;
+
+void allocate_maxbloktab_on_cublas(starpu_data_interface_t *descr __attribute__((unused)), void *arg __attribute__((unused)))
+{
+	/* Codelet body: request an allocation of both work blocks on node 1
+	 * (presumably the CUBLAS device - confirm), then give each of them an
+	 * initial partition.  Both arguments are ignored. */
+	struct divide_vector_in_blas_filter_args args1, args2;
+	starpu_filter f1, f2;
+
+	request_data_allocation(&work_block_1, 1);
+	request_data_allocation(&work_block_2, 1);
+
+	/* placeholder sizes, flagged "XXX random" by the original author */
+	args1.n1 = 1;
+	args1.n2 = 2;
+	args1.stride = 1;
+	f1.filter_func = divide_vector_in_blas_filter;
+	f1.filter_arg_ptr = &args1;
+	starpu_partition_data(&work_block_1, &f1);
+
+	args2.n1 = 0;
+	args2.n2 = 2;
+	args2.stride = 1;
+	f2.filter_func = divide_vector_in_blas_filter;
+	f2.filter_arg_ptr = &args2;
+	starpu_partition_data(&work_block_2, &f2);
+}
+
+void STARPU_DECLARE_WORK_BLOCKS(float *maxbloktab1, float *maxbloktab2, unsigned solv_coefmax)
+{
+	/*
+	 * Register the two work buffers (solv_coefmax floats each, home node 0)
+	 * as vectors, then run allocate_maxbloktab_on_cublas() as a CUBLAS job
+	 * and block until it has completed.
+	 */
+	starpu_monitor_vector_data(&work_block_1, 0 /* home */, (uintptr_t)maxbloktab1, solv_coefmax, sizeof(float));
+	starpu_monitor_vector_data(&work_block_2, 0 /* home */, (uintptr_t)maxbloktab2, solv_coefmax, sizeof(float));
+
+	starpu_codelet cl;
+	job_t j;
+	sem_t sem;
+
+	/* initialize codelet: zero it first so that the fields not set below
+	 * (e.g. core_func) do not hold indeterminate values */
+	memset(&cl, 0, sizeof(cl));
+	cl.where = CUBLAS;
+	cl.cublas_func = allocate_maxbloktab_on_cublas;
+	cl.model = NULL;
+
+	sem_init(&sem, 0, 0U);
+
+	j = job_create();
+	j->cb = _cublas_cblk_strsm_callback;
+	j->argcb = &sem;
+	j->cl = &cl;
+	j->cl_arg = NULL;
+	j->nbuffers = 0;
+
+	/* submit the codelet */
+	submit_job(j);
+
+	/* wait for its completion */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+}
+
+void _core_cblk_strsm(starpu_data_interface_t *descr, void *arg __attribute__((unused)))
+{
+	/* CPU kernel of the block-column STRSM.  descr[0] is an nx x ny BLAS
+	 * block with leading dimension ld: the triangular operand starts at its
+	 * base address and the right-hand side ny floats further.  Calls
+	 * cblas_strsm (column-major, right side, lower, transposed, unit
+	 * diagonal, alpha = 1) with m = nx - ny and n = ny. */
+	const uint32_t nx = descr[0].blas.nx;
+	const uint32_t ny = descr[0].blas.ny;
+	const uint32_t ld = descr[0].blas.ld;
+
+	float *tri = (float *)descr[0].blas.ptr;
+	float *rhs = tri + ny;
+
+	unsigned m = nx - ny;
+	unsigned n = ny;
+
+	core_strsm++;
+
+	cblas_strsm(CblasColMajor, CblasRight, CblasLower, CblasTrans, CblasUnit, m, n, 1.0f, 
+			tri, ld, rhs, ld);
+}
+
+
+void _cublas_cblk_strsm(starpu_data_interface_t *descr, void *arg __attribute__((unused)))
+{
+	/* CUBLAS kernel of the block-column STRSM; same operand layout as the
+	 * CPU kernel: triangular operand at the base of descr[0], right-hand
+	 * side ny floats further, m = nx - ny, n = ny.  Any CUBLAS error is
+	 * printed and then trips an assertion. */
+	const uint32_t nx = descr[0].blas.nx;
+	const uint32_t ny = descr[0].blas.ny;
+	const uint32_t ld = descr[0].blas.ld;
+
+	float *tri = (float *)descr[0].blas.ptr;
+	float *rhs = tri + ny;
+
+	unsigned m = nx - ny;
+	unsigned n = ny;
+
+	cublas_strsm++;
+
+	cublasStrsm ('R', 'L', 'T', 'U', m, n, 1.0, 
+		tri, ld, 
+		rhs, ld);
+
+	cublasStatus st = cublasGetError();
+	if (st)
+		fprintf(stderr, "ERROR %d\n", st);
+	STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+}
+
+/* Performance model attached to the block-column STRSM codelet: one cost
+ * function per architecture (CPU core and CUDA device). */
+static struct starpu_perfmodel_t starpu_cblk_strsm = {
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = starpu_cblk_strsm_core_cost },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = starpu_cblk_strsm_cuda_cost }
+	},
+	/* alternative model type, currently disabled: */
+//	.type = REGRESSION_BASED,
+	.type = PER_ARCH,
+	.symbol = "starpu_cblk_strsm"
+};
+
+
+void STARPU_CBLK_STRSM(unsigned col)
+{
+	/*
+	 * Perform a strsm on block column `col`: submit one job that may run on
+	 * a CPU core or on CUBLAS, with cblktab[col] as its single read-write
+	 * buffer, and block until the job has completed.
+	 */
+	starpu_codelet cl;
+	job_t j;
+	sem_t sem;
+
+	/* initialize codelet: zero it first so that any field not set below
+	 * does not hold an indeterminate value */
+	memset(&cl, 0, sizeof(cl));
+	cl.where = CORE|CUBLAS;
+	cl.core_func = _core_cblk_strsm;
+	cl.cublas_func = _cublas_cblk_strsm;
+	cl.model = &starpu_cblk_strsm;
+
+	sem_init(&sem, 0, 0U);
+
+	j = job_create();
+	j->cb = _cublas_cblk_strsm_callback;
+	j->argcb = &sem;
+	j->cl = &cl;
+	j->cl_arg = NULL;
+
+	j->nbuffers = 1;
+	/* we could be a little more precise actually */
+	j->buffers[0].state = &cblktab[col];
+	j->buffers[0].mode = RW;
+
+	/* submit the codelet */
+	submit_job(j);
+
+	/* wait for its completion */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+}
+
+/* Codelet argument of the compute_contrib_compact kernels. */
+struct starpu_compute_contrib_compact_args {
+	unsigned stride;	/* leading dimension passed for the first SGEMM operand */
+	int dimi;		/* SGEMM m */
+	int dimj;		/* SGEMM n */
+	int dima;		/* SGEMM k; also the float offset of the first operand inside buffer 0 */
+};
+
+
+void _core_compute_contrib_compact(starpu_data_interface_t *descr, void *arg)
+{
+	/* CPU kernel: C = A * B^T through cblas_sgemm (column-major, alpha = 1,
+	 * beta = 0).  A starts args->dima floats into buffer 0 and uses
+	 * args->stride as leading dimension; B is buffer 1 and C is buffer 2,
+	 * each with its own ld.  `arg` is a struct starpu_compute_contrib_compact_args. */
+	struct starpu_compute_contrib_compact_args *params = arg;
+
+	float *a_blk = (float *)descr[0].blas.ptr + params->dima;
+	float *b_blk = (float *)descr[1].blas.ptr; 
+	float *c_blk = (float *)descr[2].blas.ptr;
+	unsigned ldb = (unsigned)descr[1].blas.ld;
+	unsigned ldc = (unsigned)descr[2].blas.ld;
+
+	core_sgemm++;
+
+	cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 
+			params->dimi, params->dimj, params->dima,
+			1.0f, a_blk, params->stride,
+			      b_blk, ldb,
+			0.0 , c_blk, ldc);
+}
+
+
+void _cublas_compute_contrib_compact(starpu_data_interface_t *descr, void *arg)
+{
+	/* CUBLAS kernel: C = A * B^T through cublasSgemm ('N','T', alpha = 1,
+	 * beta = 0), with the same operand layout as the CPU kernel.  Any CUBLAS
+	 * error is printed and then trips an assertion. */
+	struct starpu_compute_contrib_compact_args *params = arg;
+
+	float *a_blk = (float *)descr[0].blas.ptr + params->dima;
+	float *b_blk = (float *)descr[1].blas.ptr;
+	float *c_blk = (float *)descr[2].blas.ptr;
+	unsigned ldb = (unsigned)descr[1].blas.ld;
+	unsigned ldc = (unsigned)descr[2].blas.ld;
+
+	cublas_sgemm++;
+
+	cublasSgemm('N','T', params->dimi, params->dimj, params->dima, 
+			1.0, a_blk, params->stride,
+			     b_blk, ldb,
+			0.0, c_blk, ldc);
+
+	cublasStatus st = cublasGetError();
+	if (st)
+		fprintf(stderr, "ERROR %d\n", st);
+	STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+}
+
+
+/* Performance model attached to the compute_contrib_compact codelet: one cost
+ * function per architecture (CPU core and CUDA device). */
+static struct starpu_perfmodel_t starpu_compute_contrib_compact = {
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = starpu_compute_contrib_compact_core_cost },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = starpu_compute_contrib_compact_cuda_cost }
+	},
+	/* alternative model type, currently disabled: */
+//	.type = REGRESSION_BASED,
+	.type = PER_ARCH,
+	.symbol = "starpu_compute_contrib_compact"
+};
+
+/*
+ * Re-partition both work blocks for the next contribution computation.
+ *
+ * The current children of the work blocks and cblktab[col] are first flagged
+ * as modified, the two work blocks are unpartitioned, and each is partitioned
+ * again with divide_vector_in_blas_filter() using sizes derived from
+ * dimi/dimj/dima/stride.
+ *
+ * Returns the number of children work_block_1 now has: 3 when its leading
+ * vector part is non-empty, 2 otherwise.
+ */
+int update_work_blocks(unsigned col, int dimi, int dimj, int dima, int stride)
+{
+	/* be paranoid XXX */
+	notify_data_modification(get_sub_data(&work_block_1, 1, 0), 0);
+	notify_data_modification(get_sub_data(&work_block_1, 1, 1), 0);
+	//notify_data_modification(get_sub_data(&work_block_1, 1, 2), 0);
+	notify_data_modification(get_sub_data(&work_block_2, 1, 0), 0);
+	notify_data_modification(get_sub_data(&work_block_2, 1, 1), 0);
+	notify_data_modification(&cblktab[col], 0);
+
+	starpu_unpartition_data(&work_block_1, 0);
+	starpu_unpartition_data(&work_block_2, 0);
+
+	starpu_filter f1, f2;
+	struct divide_vector_in_blas_filter_args args1, args2;
+
+	/* work_block_1: (stride-dima-dimi) leading elements, then a
+	 * (stride-dima) x dima BLAS block */
+	/* NOTE(review): these int expressions are stored in uint32_t fields;
+	 * presumably callers guarantee they are non-negative - confirm */
+	f1.filter_func = divide_vector_in_blas_filter;
+		args1.n1 = stride - dima - dimi; //STARPU_ASSERT(args1.n1 != 0);
+		args1.n2 = (stride - dima)*dima;
+		args1.stride = (stride - dima);
+
+	f1.filter_arg_ptr = &args1;
+	starpu_partition_data(&work_block_1, &f1);
+
+	/* work_block_2: no leading part, a dimi x dimj BLAS block first */
+	f2.filter_func = divide_vector_in_blas_filter;
+		args2.n1 = 0;
+		args2.n2 = dimi*dimj;
+		args2.stride = dimi;
+	f2.filter_arg_ptr = &args2;
+	starpu_partition_data(&work_block_2, &f2);
+
+	return (args1.n1!=0)?3:2;
+}
+
+void STARPU_COMPUTE_CONTRIB_COMPACT(unsigned col, int dimi, int dimj, int dima, int stride)
+{
+	/*
+	 * Compute a compact contribution with one SGEMM job (CPU core or CUBLAS)
+	 * and block until the job has completed.
+	 *
+	 * Buffers: [0] cblktab[col] (read), [1] the BLAS child of work_block_1
+	 * (read), [2] the BLAS child of work_block_2 (read-write).  The work
+	 * blocks are re-partitioned for these dimensions before submission.
+	 */
+	struct starpu_compute_contrib_compact_args args;
+	args.stride = stride;
+	args.dimi = dimi;
+	args.dimj = dimj;
+	args.dima = dima;
+
+	starpu_codelet cl;
+	job_t j;
+	sem_t sem;
+
+	/* initialize codelet: zero it first so that any field not set below
+	 * does not hold an indeterminate value */
+	memset(&cl, 0, sizeof(cl));
+	cl.where = CUBLAS|CORE;
+	cl.core_func = _core_compute_contrib_compact;
+	cl.cublas_func = _cublas_compute_contrib_compact;
+	cl.model = &starpu_compute_contrib_compact;
+
+	j = job_create();
+
+	j->cb = _cublas_cblk_strsm_callback;
+	j->argcb = &sem;
+	j->cl = &cl;
+	j->cl_arg = &args;
+
+	/* 2 children: the BLAS child of work_block_1 is child 0; 3 children: it
+	 * comes after the leading vector child, i.e. child 1 */
+	int ret = update_work_blocks(col, dimi, dimj, dima, stride);
+
+	j->nbuffers = 3;
+	/* we could be a little more precise actually */
+	j->buffers[0].state = &cblktab[col]; // gaik
+	j->buffers[0].mode = R;
+	j->buffers[1].state = get_sub_data(&work_block_1, 1, (ret==2)?0:1);
+	j->buffers[1].mode = R;
+	j->buffers[2].state = get_sub_data(&work_block_2, 1, 0);
+	j->buffers[2].mode = RW; // XXX W
+
+	sem_init(&sem, 0, 0U);
+
+	/* submit the codelet */
+	submit_job(j);
+
+	/* wait for its completion */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+}
+
+/*
+ *
+ *	SGEMM
+ *
+ */
+
+/* Codelet argument of _cublas_sgemm(): the scalar SGEMM parameters, forwarded
+ * unchanged to cublasSgemm(). */
+struct sgemm_args {
+	char transa;	/* transpose flag for A */
+	char transb;	/* transpose flag for B */
+	int m, n, k;	/* problem dimensions */
+	float alpha;
+	float beta;
+};
+
+
+void _cublas_sgemm(starpu_data_interface_t *descr, void *arg)
+{
+	/* CUBLAS kernel behind STARPU_SGEMM(): buffers 0, 1 and 2 are A, B and
+	 * C; the transpose flags, dimensions and scalars come from the struct
+	 * sgemm_args passed as `arg`.  Any CUBLAS error is printed and then
+	 * trips an assertion. */
+	struct sgemm_args *params = arg;
+
+	float *A = (float *)descr[0].blas.ptr;
+	float *B = (float *)descr[1].blas.ptr;
+	float *C = (float *)descr[2].blas.ptr;
+
+	/* only the leading dimensions of the descriptors are needed */
+	uint32_t ldA = descr[0].blas.ld;
+	uint32_t ldB = descr[1].blas.ld;
+	uint32_t ldC = descr[2].blas.ld;
+
+	cublasSgemm (params->transa, params->transb,
+			params->m, params->n, params->k,
+			params->alpha, A, (int)ldA,
+			B, (int)ldB,
+			params->beta, C, (int)ldC);
+
+	cublasStatus st = cublasGetError();
+	if (st)
+		fprintf(stderr, "ERROR %d\n", st);
+	STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+}
+
+static void _cublas_sgemm_callback(void *sem)
+{
+	/* Job completion callback: `sem` is the sem_t the submitter is waiting
+	 * on; post it. */
+	sem_post((sem_t *)sem);
+}
+
+void STARPU_SGEMM (const char *transa, const char *transb, const int m,
+                   const int n, const int k, const float alpha,
+                   const float *A, const int lda, const float *B,
+                   const int ldb, const float beta, float *C, const int ldc)
+{
+	/*
+	 * SGEMM entry point with BLAS-style arguments, executed as a CUBLAS
+	 * job: A, B and C are registered as BLAS matrices homed on node 0, one
+	 * job is submitted and the call blocks until it has completed.
+	 *
+	 * transa / transb point to the transpose flag character; the registered
+	 * shape of A and B depends on whether the flag is 'N'/'n'.
+	 */
+	struct sgemm_args args;
+	args.transa = *transa;
+	args.transb = *transb;
+	args.alpha = alpha;
+	args.beta = beta;
+	args.m = m;
+	args.n = n;
+	args.k = k;
+
+	data_state A_state;
+	data_state B_state;
+	data_state C_state;
+
+	starpu_codelet cl;
+	job_t j;
+	sem_t sem;
+
+	/* <ctype.h> functions need a value representable as unsigned char */
+	const int a_is_n = (toupper((unsigned char)*transa) == 'N');
+	const int b_is_n = (toupper((unsigned char)*transb) == 'N');
+
+	/* A is m x k when not transposed, k x m otherwise */
+	starpu_monitor_blas_data(&A_state, 0, (uintptr_t)A, lda,
+			a_is_n ? m : k, a_is_n ? k : m, sizeof(float));
+
+	/* B is k x n when not transposed, n x k otherwise */
+	starpu_monitor_blas_data(&B_state, 0, (uintptr_t)B, ldb,
+			b_is_n ? k : n, b_is_n ? n : k, sizeof(float));
+
+	starpu_monitor_blas_data(&C_state, 0, (uintptr_t)C, ldc, m, n, sizeof(float));
+
+	/* initialize codelet: zero it first so that the fields not set below
+	 * (e.g. core_func) do not hold indeterminate values */
+	memset(&cl, 0, sizeof(cl));
+	cl.where = CUBLAS;
+	cl.cublas_func = _cublas_sgemm;
+	cl.model = NULL;
+
+	j = job_create();
+	j->cb = _cublas_sgemm_callback;
+	j->argcb = &sem;
+	j->cl = &cl;
+	j->cl_arg = &args;
+
+	j->nbuffers = 3;
+	j->buffers[0].state = &A_state;
+	j->buffers[0].mode = R;
+	j->buffers[1].state = &B_state;
+	j->buffers[1].mode = R;
+	j->buffers[2].state = &C_state;
+	j->buffers[2].mode = RW;
+
+	sem_init(&sem, 0, 0U);
+
+	/* submit the codelet */
+	submit_job(j);
+
+	/* wait for its completion */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	/* make sure data are in memory again */
+	starpu_unpartition_data(&A_state, 0);
+	starpu_unpartition_data(&B_state, 0);
+	starpu_unpartition_data(&C_state, 0);
+	/* NOTE(review): the matching starpu_delete_data() calls were disabled
+	 * by the original author, so the three handles are not unregistered. */
+	//starpu_delete_data(&A_state);
+	//starpu_delete_data(&B_state);
+	//starpu_delete_data(&C_state);
+}
+
+
+/*
+ *
+ *	STRSM
+ *
+ */
+
+/* Scalar STRSM parameters, named after the BLAS arguments.  The only user in
+ * this file is the commented-out _core_strsm() that follows. */
+struct strsm_args {
+	char side;
+	char uplo;
+	char transa;
+	char diag;
+	float alpha;
+	int m,n;
+};
+//
+//void _core_strsm(starpu_data_interface_t *descr, void *arg)
+//{
+//	float *A, *B;
+//	uint32_t nxA, nyA, ldA;
+//	uint32_t nxB, nyB, ldB;
+//
+//	A = (float *)descr[0].blas.ptr;
+//	nxA = descr[0].blas.nx;
+//	nyA = descr[0].blas.ny;
+//	ldA = descr[0].blas.ld;
+//
+//	B = (float *)descr[1].blas.ptr;
+//	nxB = descr[1].blas.nx;
+//	nyB = descr[1].blas.ny;
+//	ldB = descr[1].blas.ld;
+//
+//	struct strsm_args *args = arg;
+//
+//	fprintf(stderr, "CORE STRSM nxA %d nyA %d nxB %d nyB %d lda %d ldb %d\n", nxA, nyA, nxB, nyB, ldA, ldB);
+//
+//	SOPALIN_TRSM("R","L","T","U",dimb,dima,fun,ga,stride,gb,stride);
+//	
+//}
+
+/* 
+ *	
+ *	
+ *
+ */
+
+
+void CUBLAS_SGEMM (const char *transa, const char *transb, const int m,
+                   const int n, const int k, const float alpha,
+                   const float *A, const int lda, const float *B,
+                   const int ldb, const float beta, float *C, const int ldc)
+{
+    /*
+     * Stand-alone SGEMM on the device, without going through StarPU jobs:
+     * allocate device copies of A, B and C, upload them, run cublasSgemm,
+     * download C and release the device buffers.
+     *
+     * Every CUBLAS call is checked; a failure trips STARPU_ASSERT instead of
+     * continuing with an invalid device pointer.
+     */
+    int ka, kb;
+    float *devPtrA = NULL, *devPtrB = NULL, *devPtrC = NULL;
+    cublasStatus st;
+
+    /* <ctype.h> functions need a value representable as unsigned char */
+    const int a_is_n = (toupper((unsigned char)transa[0]) == 'N');
+    const int b_is_n = (toupper((unsigned char)transb[0]) == 'N');
+
+    /*  A      - REAL             array of DIMENSION ( LDA, ka ), where ka is
+     *           k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
+     *           Before entry with  TRANSA = 'N' or 'n',  the leading  m by k
+     *           part of the array  A  must contain the matrix  A,  otherwise
+     *           the leading  k by m  part of the array  A  must contain  the
+     *           matrix A.
+     */
+    ka = a_is_n ? k : m;
+    st = cublasAlloc (lda * ka, sizeof(devPtrA[0]), (void**)&devPtrA);
+    STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+    st = cublasSetMatrix (STARPU_MIN(a_is_n ? m : k, lda), ka, sizeof(A[0]),
+                          A, lda, devPtrA, lda);
+    STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+
+    /*  B      - REAL             array of DIMENSION ( LDB, kb ), where kb is
+     *           n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
+     *           Before entry with  TRANSB = 'N' or 'n',  the leading  k by n
+     *           part of the array  B  must contain the matrix  B,  otherwise
+     *           the leading  n by k  part of the array  B  must contain  the
+     *           matrix B.
+     */
+    kb = b_is_n ? n : k;
+    st = cublasAlloc (ldb * kb, sizeof(devPtrB[0]), (void**)&devPtrB);
+    STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+    st = cublasSetMatrix (STARPU_MIN(b_is_n ? k : n, ldb), kb, sizeof(B[0]),
+                          B, ldb, devPtrB, ldb);
+    STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+
+    /*  C      - REAL             array of DIMENSION ( LDC, n ).
+     *           Before entry, the leading  m by n  part of the array  C must
+     *           contain the matrix  C,  except when  beta  is zero, in which
+     *           case C need not be set on entry.
+     *           On exit, the array  C  is overwritten by the  m by n  matrix
+     */
+    st = cublasAlloc ((ldc) * (n), sizeof(devPtrC[0]), (void**)&devPtrC);
+    STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+    st = cublasSetMatrix (STARPU_MIN(m,ldc), n, sizeof(C[0]), C, ldc, devPtrC, ldc);
+    STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+
+    cublasSgemm (transa[0], transb[0], m, n, k, alpha, devPtrA, lda, 
+                 devPtrB, ldb, beta, devPtrC, ldc);
+    /* same error reporting as the codelet kernels of this file */
+    st = cublasGetError();
+    if (st) fprintf(stderr, "ERROR %d\n", st);
+    STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+
+    st = cublasGetMatrix (STARPU_MIN(m,ldc), n, sizeof(C[0]), devPtrC, ldc, C, ldc);
+    STARPU_ASSERT(st == CUBLAS_STATUS_SUCCESS);
+
+    cublasFree (devPtrA);
+    cublasFree (devPtrB);
+    cublasFree (devPtrC);
+}
+
+

+ 108 - 0
examples/pastix-wrappers/starpu-blas-wrapper.h

@@ -0,0 +1,108 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_BLAS_WRAPPER_H__
+#define __STARPU_BLAS_WRAPPER_H__
+
+#include "generated_model.h"
+
+#define OVERHEAD	150000.0
+
+static double transfer_time_dtoh(unsigned size)
+{
+	/* Linear transfer-time estimate (latency plus a term proportional to
+	 * `size`).  Both coefficients are currently 0.0, so the result is 0
+	 * for every size. */
+	const double latency = 0.0;
+	const double bandwidth = 0.0;
+
+	return latency + size*bandwidth;
+}
+
+static double transfer_time_htod(unsigned size)
+{
+	/* Linear transfer-time estimate (latency plus a term proportional to
+	 * `size`).  Both coefficients are currently 0.0, so the result is 0
+	 * for every size. */
+	const double latency = 0.0;
+	const double bandwidth = 0.0;
+
+	return latency + size*bandwidth;
+}
+
+/*GEMM CPU */
+
+#define PERF_GEMM_CPU(i,j,k) (GEMM_CPU_A*(double)(i)*(double)(j)*(double)(k)+GEMM_CPU_B*(double)(i)*(double)(j)+GEMM_CPU_C*(double)(j)*(double)(k)+GEMM_CPU_D*(double)(i)+GEMM_CPU_E*(double)(j)+GEMM_CPU_F)
+
+static double starpu_compute_contrib_compact_core_cost(starpu_buffer_descr *descr)
+{
+	/* CPU cost estimate of compute_contrib_compact: PERF_GEMM_CPU evaluated
+	 * with i = nx - ny and k = ny of buffer 0, and j = ny of buffer 2. */
+	const unsigned nx0 = descr[0].state->interface->blas.nx;
+	const unsigned ny0 = descr[0].state->interface->blas.ny;
+	const unsigned ny2 = descr[2].state->interface->blas.ny;
+	const unsigned rows = nx0 - ny0;
+
+	return PERF_GEMM_CPU(rows, ny2, ny0); 
+}
+
+
+
+/*GEMM GPU */
+
+#define PERF_GEMM_GPU(i,j,k) (GEMM_GPU_A*(double)(i)*(double)(j)*(double)(k)+GEMM_GPU_B*(double)(i)*(double)(j)+GEMM_GPU_C*(double)(j)*(double)(k)+GEMM_GPU_D*(double)(i)+GEMM_GPU_E*(double)(j)+GEMM_GPU_F)
+
+static double starpu_compute_contrib_compact_cuda_cost(starpu_buffer_descr *descr)
+{
+	/* CUDA cost estimate of compute_contrib_compact: PERF_GEMM_GPU with
+	 * i = nx - ny and k = ny of buffer 0, and j = ny of buffer 2, plus the
+	 * constant OVERHEAD. */
+	const unsigned nx0 = descr[0].state->interface->blas.nx;
+	const unsigned ny0 = descr[0].state->interface->blas.ny;
+	const unsigned ny2 = descr[2].state->interface->blas.ny;
+	const unsigned rows = nx0 - ny0;
+
+	return PERF_GEMM_GPU(rows, ny2, ny0) + OVERHEAD; 
+}
+
+
+/*TRSM GPU */
+
+#define PERF_TRSM_GPU(i,j)   (TRSM_GPU_A*(double)(i)*(double)(i)*(double)(j)+TRSM_GPU_B*(double)(i)+TRSM_GPU_C)
+
+static double starpu_cblk_strsm_cuda_cost(starpu_buffer_descr *descr)
+{
+	/* CUDA cost estimate of the block-column STRSM: PERF_TRSM_GPU with
+	 * i = nx - ny and j = ny of buffer 0, plus the constant OVERHEAD. */
+	const unsigned nx = descr[0].state->interface->blas.nx;
+	const unsigned ny = descr[0].state->interface->blas.ny;
+	const unsigned rows = nx - ny;
+
+	return PERF_TRSM_GPU(rows, ny) + OVERHEAD; 
+}
+
+/*TRSM CPU */
+
+#define PERF_TRSM_CPU(i,j)   (TRSM_CPU_A*(double)(i)*(double)(i)*(double)(j)+TRSM_CPU_B*(double)(i)+TRSM_CPU_C)
+
+static double starpu_cblk_strsm_core_cost(starpu_buffer_descr *descr)
+{
+	/* CPU cost estimate of the block-column STRSM: PERF_TRSM_CPU with
+	 * i = nx - ny and j = ny of buffer 0. */
+	const unsigned nx = descr[0].state->interface->blas.nx;
+	const unsigned ny = descr[0].state->interface->blas.ny;
+	const unsigned rows = nx - ny;
+
+	return PERF_TRSM_CPU(rows, ny); 
+}
+
+/* Start the StarPU runtime (only the first call has an effect). */
+void STARPU_INIT(void);
+/* Shut the runtime down and print per-backend kernel counts on stderr. */
+void STARPU_TERMINATE(void);
+/* SGEMM with BLAS-style arguments, run as a CUBLAS job; blocks until done. */
+void STARPU_SGEMM (const char *transa, const char *transb, const int m,
+                   const int n, const int k, const float alpha,
+                   const float *A, const int lda, const float *B,
+                   const int ldb, const float beta, float *C, const int ldc);
+/* NOTE(review): starpu-blas-wrapper.c contains no definition of STARPU_STRSM;
+ * presumably it is provided elsewhere or not implemented yet - confirm. */
+void STARPU_STRSM (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int m, const int n, 
+                   const float alpha, const float *A, const int lda,
+                   float *B, const int ldb);
+
+#endif // __STARPU_BLAS_WRAPPER_H__

+ 46 - 0
examples/spmv/Makefile.in

@@ -0,0 +1,46 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+LIBS+=$$(pkg-config --libs libstarpu)
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+# The CUDA kernel binary is only a prerequisite when USE_CUDA is "yes".
+# Indented with spaces so the assignment can never be read as a recipe line.
+ifeq (@USE_CUDA@,yes)
+  EXTRADEP += ../cuda/spmv_cuda.cubin
+endif
+
+# Neither target produces a file of that name.
+.PHONY: all clean
+
+# Sub-makes are started through $(MAKE) so that command-line options and the
+# jobserver (-j) are passed down to them.
+all: $(EXTRADEP) dw_spmv dw_block_spmv
+	$(MAKE) -C matrix-market
+
+../cuda/spmv_cuda.cubin:
+	$(MAKE) -C ../cuda spmv_cuda.cubin
+
+dw_spmv.o: dw_spmv.c dw_spmv.h
+	$(CC) $(CFLAGS) dw_spmv.c -c -o dw_spmv.o
+
+dw_spmv: dw_spmv.o
+	$(CC) dw_spmv.o -o dw_spmv $(LDFLAGS) $(LIBS)
+
+# The matrix-market objects are built by the sub-make right before linking.
+dw_block_spmv: dw_block_spmv.o dw_block_spmv_kernels.o
+	$(MAKE) -C matrix-market mm_to_bcsr.o
+	$(MAKE) -C matrix-market mmio.o
+	$(CC) dw_block_spmv.o dw_block_spmv_kernels.o matrix-market/mm_to_bcsr.o matrix-market/mmio.o -o dw_block_spmv $(LDFLAGS) $(LIBS)
+
+clean:
+	@$(MAKE) -C matrix-market clean
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f dw_spmv dw_block_spmv

+ 288 - 0
examples/spmv/dw_block_spmv.c

@@ -0,0 +1,288 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_block_spmv.h"
+#include "matrix-market/mm_to_bcsr.h"
+
+struct timeval start;	/* taken in launch_spmv_codelets() right before the tasks are built */
+struct timeval end;	/* taken by the callback that sees the task counter reach zero */
+
+sem_t sem;	/* posted by that last callback, waited on in main() */
+
+unsigned c = 256;	/* block dimensions handed to mm_file_to_bcsr(); main() overrides both with the tile size argument */
+unsigned r = 256;
+
+unsigned remainingtasks = -1;	/* decremented by init_problem_callback(); set to NSPMV*nblocks before submission */
+
+starpu_data_handle sparse_matrix;	/* BCSR matrix registered in create_data() */
+starpu_data_handle vector_in, vector_out;	/* handles registered on vector_in_ptr / vector_out_ptr */
+
+uint32_t size;	/* element count of both vectors: c*r*starpu_get_bcsr_nnz(sparse_matrix) */
+char *inputfile;	/* argv[1] */
+bcsr_t *bcsr_matrix;	/* result of mm_file_to_bcsr() */
+
+float *vector_in_ptr;	/* every element set to 2.0f in create_data() */
+float *vector_out_ptr;	/* every element set to 0.0f in create_data() */
+
+/*
+ * Load the input file as a block-CSR matrix, register it with the runtime,
+ * then allocate, fill and register the input and output vectors.
+ */
+void create_data(void)
+{
+	/* parse the input file using the global block dimensions c and r */
+	bcsr_matrix = mm_file_to_bcsr(inputfile, c, r);
+
+	/* hand the resulting block CSR structure over to the runtime */
+	starpu_monitor_bcsr_data(&sparse_matrix, 0,
+			bcsr_matrix->nnz_blocks, bcsr_matrix->nrows_blocks,
+			(uintptr_t)bcsr_matrix->val, bcsr_matrix->colind,
+			bcsr_matrix->rowptr, 0,
+			bcsr_matrix->r, bcsr_matrix->c, sizeof(float));
+
+	size = c*r*starpu_get_bcsr_nnz(sparse_matrix);
+
+	/* both vectors hold `size` floats */
+	vector_in_ptr = malloc(size*sizeof(float));
+	assert(vector_in_ptr);
+
+	vector_out_ptr = malloc(size*sizeof(float));
+	assert(vector_out_ptr);
+
+	/* input is all 2.0, output starts at 0.0 */
+	unsigned pos = 0;
+	while (pos < size)
+	{
+		vector_in_ptr[pos] = 2.0f;
+		vector_out_ptr[pos] = 0.0f;
+		pos++;
+	}
+
+	starpu_monitor_vector_data(&vector_in, 0, (uintptr_t)vector_in_ptr, size, sizeof(float));
+	starpu_monitor_vector_data(&vector_out, 0, (uintptr_t)vector_out_ptr, size, sizeof(float));
+}
+
+/*
+ * Task completion callback. arg points to the shared task counter, which is
+ * updated through STARPU_ATOMIC_ADD(.., -1). When the value it reports is
+ * zero, the end time is recorded, the output vector is unpartitioned and the
+ * semaphore is posted. The sparse matrix is left partitioned.
+ */
+void init_problem_callback(void *arg)
+{
+	unsigned *counter = arg;
+	unsigned left = STARPU_ATOMIC_ADD(counter, -1);
+
+	if (left != 0)
+		return;
+
+	printf("DONE ...\n");
+	gettimeofday(&end, NULL);
+
+	starpu_unpartition_data(vector_out, 0);
+
+	sem_post(&sem);
+}
+
+
+/*
+ * Partition the three data handles: the matrix with the canonical BCSR block
+ * filter, the input vector into size/c pieces and the output vector into
+ * size/r pieces.
+ */
+void call_filters(void)
+{
+	starpu_filter matrix_filter;
+	starpu_filter in_filter;
+	starpu_filter out_filter;
+
+	/* only the function is set for the matrix filter, as in the vector-less case no argument is given */
+	matrix_filter.filter_func = starpu_canonical_block_filter_bcsr;
+
+	in_filter.filter_func = starpu_block_filter_func_vector;
+	in_filter.filter_arg  = size/c;
+
+	out_filter.filter_func = starpu_block_filter_func_vector;
+	out_filter.filter_arg  = size/r;
+
+	starpu_partition_data(sparse_matrix, &matrix_filter);
+	starpu_partition_data(vector_in, &in_filter);
+	starpu_partition_data(vector_out, &out_filter);
+}
+
+#define NSPMV	32	/* number of passes over the matrix in launch_spmv_codelets() */
+unsigned totaltasks;	/* NSPMV * number of non-zero blocks, set in launch_spmv_codelets() */
+
+starpu_codelet cl = {	/* codelet shared by every block SpMV task */
+	.where = CORE|CUBLAS,
+	.core_func =  core_block_spmv,
+#ifdef USE_CUDA
+	.cublas_func = cublas_block_spmv,
+#endif
+	.nbuffers = 3	/* matrix block, input vector piece, output vector piece */
+};
+
+/*
+ * Build and submit the block SpMV tasks: NSPMV passes over the matrix, one
+ * task per non-zero block and per pass.
+ *
+ * Each task reads its matrix block and the piece of vector_in selected by the
+ * block's column index, and accesses in RW mode the piece of vector_out
+ * selected by the block's row. Tasks are chained through tags: a task depends
+ * on the previous one unless its index equals rowptr[row & ~0x3], i.e. unless
+ * it opens a group of four consecutive rows (an "entry" task).
+ *
+ * task_tab is deliberately not freed here: main() only waits for completion
+ * after this function returns, so the runtime presumably still uses the tasks.
+ */
+void launch_spmv_codelets(void)
+{
+	struct starpu_task *task_tab;
+	uint8_t *is_entry_tab;
+
+	/* we call one codelet per block */
+	unsigned nblocks = starpu_get_bcsr_nnz(sparse_matrix);
+	unsigned nrows = starpu_get_bcsr_nrow(sparse_matrix);
+
+	remainingtasks = NSPMV*nblocks;
+	totaltasks = remainingtasks;
+
+	if (totaltasks == 0)
+	{
+		/* nothing to run: no callback would ever post the semaphore
+		 * main() waits on, so release it here with a zero duration */
+		gettimeofday(&start, NULL);
+		end = start;
+		printf("there will be %u codelets\n", totaltasks);
+		sem_post(&sem);
+		return;
+	}
+
+	unsigned taskid = 0;
+
+	/* zero-filled so that the task fields not assigned below are not left
+	 * indeterminate */
+	task_tab = calloc(totaltasks, sizeof(struct starpu_task));
+	STARPU_ASSERT(task_tab);
+
+	is_entry_tab = calloc(totaltasks, sizeof(uint8_t));
+	STARPU_ASSERT(is_entry_tab);
+
+	printf("there will be %u codelets\n", remainingtasks);
+
+	uint32_t *rowptr = starpu_get_bcsr_local_rowptr(sparse_matrix);
+	uint32_t *colind = starpu_get_bcsr_local_colind(sparse_matrix);
+
+	gettimeofday(&start, NULL);
+
+	unsigned loop;
+	for (loop = 0; loop < NSPMV; loop++)
+	{
+		unsigned row;
+		/* index of the matrix sub-data, restarts at every pass */
+		unsigned part = 0;
+
+		for (row = 0; row < nrows; row++)
+		{
+			unsigned index;
+
+			/* rows without any block are handled by the loop bounds below */
+			for (index = rowptr[row]; index < rowptr[row+1]; index++, part++)
+			{
+				struct starpu_task *task = &task_tab[taskid];
+
+				task->use_tag = 1;
+				task->tag_id = taskid;
+
+				task->callback_func = init_problem_callback;
+				task->callback_arg = &remainingtasks;
+				task->cl = &cl;
+				task->cl_arg = NULL;
+
+				unsigned i = colind[index];
+				unsigned j = row;
+
+				task->buffers[0].state = get_sub_data(sparse_matrix, 1, part);
+				task->buffers[0].mode  = R;
+				task->buffers[1].state = get_sub_data(vector_in, 1, i);
+				task->buffers[1].mode = R;
+				task->buffers[2].state = get_sub_data(vector_out, 1, j);
+				task->buffers[2].mode = RW;
+
+				/* tasks are chained so that we don't wait too much for data:
+				 * every task waits for the previous one, except the one
+				 * that starts a group of four rows */
+				if (index != rowptr[row & ~0x3])
+				{
+					/* not the first task of the group */
+					starpu_tag_declare_deps(taskid, 1, taskid-1);
+
+					is_entry_tab[taskid] = 0;
+				}
+				else {
+					/* this is an entry task */
+					is_entry_tab[taskid] = 1;
+				}
+
+				taskid++;
+			}
+		}
+	}
+
+	printf("start submitting tasks !\n");
+
+	/* submit ALL tasks now */
+	unsigned nchains = 0;
+	unsigned task;
+	for (task = 0; task < totaltasks; task++)
+	{
+		if (is_entry_tab[task]) {
+			nchains++;
+		}
+
+		starpu_submit_task(&task_tab[task]);
+	}
+
+	/* the entry flags were only needed to count the chains */
+	free(is_entry_tab);
+
+	printf("end of task submission (there was %u chains for %u tasks : ratio %u tasks per chain) !\n", nchains, totaltasks, nchains ? totaltasks/nchains : 0);
+}
+
+/* Set the problem up: build and register the data, then partition it. */
+void init_problem(void)
+{
+	/* matrix and vectors */
+	create_data();
+
+	/* split them into the pieces the tasks will work on */
+	call_filters();
+}
+
+/* Print the leading entries (at most 16) of the input and output vectors, one pair per line. */
+void print_results(void)
+{
+	unsigned shown = 0;
+
+	while (shown < STARPU_MIN(size, 16))
+	{
+		printf("%2.2f\t%2.2f\n", vector_in_ptr[shown], vector_out_ptr[shown]);
+		shown++;
+	}
+}
+
+/*
+ * Usage: dw_block_spmv filename [tile size]
+ *
+ * Reads the matrix from `filename`, runs the block SpMV tasks and prints the
+ * first vector entries, the elapsed time in ms (stdout) and the flop count
+ * and GFlops rate (stderr). The optional tile size sets both block
+ * dimensions r and c and must be a strictly positive decimal integer: it is
+ * later used as a divisor when the vectors are partitioned.
+ */
+int main(int argc, char **argv)
+{
+	if (argc < 2)
+	{
+		fprintf(stderr, "usage : %s filename [tile size]\n", argv[0]);
+		exit(-1);
+	}
+
+	if (argc == 3)
+	{
+		/* third argument is the tile size */
+		char *argptr;
+		long tile = strtol(argv[2], &argptr, 10);
+
+		/* reject empty, non-numeric, trailing-garbage and non-positive values */
+		if (argptr == argv[2] || *argptr != '\0' || tile <= 0)
+		{
+			fprintf(stderr, "invalid tile size '%s'\n", argv[2]);
+			exit(-1);
+		}
+
+		r = (unsigned)tile;
+		c = r;
+	}
+
+	inputfile = argv[1];
+
+	/* start the runtime */
+	starpu_init();
+
+	sem_init(&sem, 0, 0U);
+
+	init_problem();
+
+	launch_spmv_codelets();
+
+	/* posted by the callback of the last task */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	print_results();
+
+	double totalflop = 2.0*c*r*totaltasks;
+
+	/* elapsed time in microseconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+	fprintf(stderr, "Flop %e\n", totalflop);
+	fprintf(stderr, "GFlops : %2.2f\n", totalflop/timing/1000);
+
+	return 0;
+}

+ 41 - 0
examples/spmv/dw_block_spmv.h

@@ -0,0 +1,41 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DW_BLOCK_SPMV_H__
+#define __DW_BLOCK_SPMV_H__
+
+#include <semaphore.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+#include <cblas.h>
+
+#include <starpu.h>
+
+#ifdef USE_CUDA
+#include <cublas.h>
+#endif
+
+void core_block_spmv(starpu_data_interface_t *descr, void *_args);	/* CPU implementation, defined in dw_block_spmv_kernels.c */
+
+#ifdef USE_CUDA
+void cublas_block_spmv(starpu_data_interface_t *descr, void *_args);	/* CUBLAS implementation, only built when USE_CUDA is defined */
+#endif // USE_CUDA
+
+#endif // __DW_BLOCK_SPMV_H__

+ 64 - 0
examples/spmv/dw_block_spmv_kernels.c

@@ -0,0 +1,64 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_block_spmv.h"
+
+/*
+ * Shared body of the block SpMV codelets.
+ *
+ * buffers[0] is accessed through its blas interface (ptr, nx, ny, ld),
+ * buffers[1] and buffers[2] through their vector interfaces. Both backends
+ * are called with alpha = beta = 1, so the product is added to the output.
+ * s selects the backend: 0 for cblas_sgemv, 1 for cublasSgemv (USE_CUDA
+ * builds only); any other value trips STARPU_ASSERT.
+ *
+ * NOTE(review): the CBLAS call is row-major/no-transpose while the CUBLAS
+ * call is 't' with the same (nx, ny) order; these look equivalent only for
+ * square blocks - confirm.
+ */
+static inline void common_block_spmv(starpu_data_interface_t *buffers, int s, __attribute__((unused)) void *_args)
+{
+	float *blk = (float *)buffers[0].blas.ptr;
+	unsigned nx = buffers[0].blas.nx;
+	unsigned ny = buffers[0].blas.ny;
+	unsigned stride = buffers[0].blas.ld;
+
+	float *x = (float *)buffers[1].vector.ptr;
+	float *y = (float *)buffers[2].vector.ptr;
+
+	if (s == 0) {
+		cblas_sgemv(CblasRowMajor, CblasNoTrans, nx, ny, 1.0f, blk, stride, x, 1, 1.0f, y, 1);
+		return;
+	}
+#ifdef USE_CUDA
+	if (s == 1) {
+		cublasSgemv ('t', nx, ny, 1.0f, blk, stride, x, 1, 1.0f, y, 1);
+		return;
+	}
+#endif
+	/* unknown backend */
+	STARPU_ASSERT(0);
+}
+
+/* CPU codelet: runs the shared kernel with backend selector 0 (CBLAS). */
+void core_block_spmv(starpu_data_interface_t *descr, void *_args)
+{
+	common_block_spmv(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS codelet: runs the shared kernel with backend selector 1. */
+void cublas_block_spmv(starpu_data_interface_t *descr, void *_args)
+{
+	common_block_spmv(descr, 1, _args);
+}
+#endif// USE_CUDA

+ 349 - 0
examples/spmv/dw_spmv.c

@@ -0,0 +1,349 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * Sparse matrix-vector product (SpMV) on a CSR matrix
+ */
+
+#include "dw_spmv.h"
+
+struct timeval start;	/* taken right before the tasks are created */
+struct timeval end;	/* taken by the callback that sees the task counter reach zero */
+
+unsigned nblocks = 1;	/* number of partitions and of tasks; "-nblocks" option */
+unsigned remainingtasks = -1;	/* set to nblocks, then decremented by init_problem_callback() */
+
+/* First a Matrix-Vector product (SpMV) */
+
+unsigned blocks = 512;	/* used as the CUDA blockx value; "-block" option */
+unsigned grids  = 8;	/* used as the CUDA gridx value; "-grid" option */
+
+#ifdef USE_CUDA
+/* CUDA spmv codelet */
+static struct starpu_cuda_module_s cuda_module;
+static struct starpu_cuda_function_s cuda_function;
+static starpu_cuda_codelet_t cuda_spmv;
+
+/*
+ * Load the spmv_cuda.cubin module found under STARPUDIR, look up the
+ * "spmv_kernel_3" function in it and fill in the launch configuration of
+ * cuda_spmv from the global `grids` and `blocks` values.
+ */
+void initialize_cuda(void)
+{
+	char module_path[1024];
+
+	/* bounded formatting: a long STARPUDIR must not overflow the buffer */
+	int len = snprintf(module_path, sizeof(module_path),
+		"%s/examples/cuda/spmv_cuda.cubin", STARPUDIR);
+	STARPU_ASSERT(len >= 0 && (size_t)len < sizeof(module_path));
+
+	char *function_symbol = "spmv_kernel_3";
+
+	starpu_init_cuda_module(&cuda_module, module_path);
+	starpu_init_cuda_function(&cuda_function, &cuda_module, function_symbol);
+
+	cuda_spmv.func = &cuda_function;
+	cuda_spmv.stack = NULL;
+	cuda_spmv.stack_size = 0;
+
+	cuda_spmv.gridx = grids;
+	cuda_spmv.gridy = 1;
+
+	cuda_spmv.blockx = blocks;
+	cuda_spmv.blocky = 1;
+
+	cuda_spmv.shmemsize = 60;
+}
+
+
+
+
+#endif // USE_CUDA
+
+
+sem_t sem;	/* posted by the last callback, waited on in main() */
+uint32_t size = 4194304;	/* matrix order and vector length; "-size" option */
+
+starpu_data_handle sparse_matrix;	/* CSR matrix registered in create_data() */
+starpu_data_handle vector_in, vector_out;	/* handles registered on the two vectors */
+
+float *sparse_matrix_nzval;	/* the three CSR arrays allocated in create_data() */
+uint32_t *sparse_matrix_colind;
+uint32_t *sparse_matrix_rowptr;
+
+float *vector_in_ptr;	/* every element set to 2.0f in create_data() */
+float *vector_out_ptr;	/* every element set to 0.0f in create_data() */
+
+unsigned usecpu = 0;	/* set by "-cpu"; not read in the visible part of this file */
+
+
+/*
+ * Return the decimal value that follows the option at argv[*pos] and advance
+ * *pos onto it. Exits with an error message when the option is the last
+ * argument, instead of reading past the end of argv.
+ */
+static unsigned parse_args_value(int argc, char **argv, int *pos)
+{
+	if (*pos + 1 >= argc)
+	{
+		fprintf(stderr, "missing value after %s\n", argv[*pos]);
+		exit(-1);
+	}
+
+	char *argptr;
+	return strtol(argv[++(*pos)], &argptr, 10);
+}
+
+/*
+ * Parse the command line into the global settings:
+ *   -size N     -> size
+ *   -block N    -> blocks
+ *   -grid N     -> grids
+ *   -nblocks N  -> nblocks
+ *   -cpu        -> usecpu = 1
+ * Unknown arguments are ignored. The word following a valued option is
+ * always consumed as its value and is not itself interpreted as an option.
+ */
+void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			size = parse_args_value(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-block") == 0) {
+			blocks = parse_args_value(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-grid") == 0) {
+			grids = parse_args_value(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-nblocks") == 0) {
+			nblocks = parse_args_value(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-cpu") == 0) {
+			usecpu = 1;
+		}
+	}
+}
+
+/*
+ * CPU SpMV kernel on one CSR piece.
+ *
+ * descr[0] is read through its csr interface (nzval, colind, rowptr,
+ * firstentry, nrow), descr[1] and descr[2] through their vector interfaces.
+ * For every row, the products nzval[k] * in[colind[k]] are summed over the
+ * entries of that row and the sum is stored in out[row]. rowptr values are
+ * shifted by firstentry before being used as indices into nzval/colind.
+ * The output vector must have exactly nrow elements.
+ */
+void core_spmv(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	float *values = (float *)descr[0].csr.nzval;
+	uint32_t *columns = descr[0].csr.colind;
+	uint32_t *row_start = descr[0].csr.rowptr;
+	uint32_t offset = descr[0].csr.firstentry;
+	uint32_t nrow = descr[0].csr.nrow;
+
+	float *x = (float *)descr[1].vector.ptr;
+	float *y = (float *)descr[2].vector.ptr;
+
+	STARPU_ASSERT(nrow == descr[2].vector.nx);
+
+	unsigned row;
+	for (row = 0; row < nrow; row++)
+	{
+		unsigned k = row_start[row] - offset;
+		unsigned stop = row_start[row+1] - offset;
+		float acc = 0.0f;
+
+		while (k < stop)
+		{
+			unsigned col = columns[k];
+
+			acc += values[k]*x[col];
+			k++;
+		}
+
+		y[row] = acc;
+	}
+}
+
+/*
+ * Build the test problem: a size x size tridiagonal matrix in CSR form
+ * (5.0 on the diagonal, 1.0 on both neighbouring diagonals), an input vector
+ * filled with 2.0 and an output vector filled with 0.0. All three are
+ * registered with the runtime and their arrays are kept in the globals.
+ */
+void create_data(void)
+{
+	/* three entries per row, minus the two that fall outside the matrix */
+	uint32_t nnz = 3*size-2;
+
+	float *nzval = malloc(nnz*sizeof(float));
+	uint32_t *colind = malloc(nnz*sizeof(uint32_t));
+	uint32_t *rowptr = malloc((size+1)*sizeof(uint32_t));
+
+	assert(nzval);
+	assert(colind);
+	assert(rowptr);
+
+	/* fill the matrix row by row */
+	unsigned pos = 0;
+	unsigned row;
+	for (row = 0; row < size; row++)
+	{
+		rowptr[row] = pos;
+
+		/* sub-diagonal entry, absent on the first row */
+		if (row != 0) {
+			nzval[pos] = 1.0f;
+			colind[pos] = row-1;
+			pos++;
+		}
+
+		/* diagonal entry */
+		nzval[pos] = 5.0f;
+		colind[pos] = row;
+		pos++;
+
+		/* super-diagonal entry, absent on the last row */
+		if (row + 1 < size) {
+			nzval[pos] = 1.0f;
+			colind[pos] = row+1;
+			pos++;
+		}
+	}
+
+	STARPU_ASSERT(pos == nnz);
+
+	rowptr[size] = nnz;
+
+	starpu_monitor_csr_data(&sparse_matrix, 0, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float));
+
+	sparse_matrix_nzval = nzval;
+	sparse_matrix_colind = colind;
+	sparse_matrix_rowptr = rowptr;
+
+	/* the two dense vectors */
+	float *invec = malloc(size*sizeof(float));
+	assert(invec);
+
+	float *outvec = malloc(size*sizeof(float));
+	assert(outvec);
+
+	unsigned ind = 0;
+	while (ind < size)
+	{
+		invec[ind] = 2.0f;
+		outvec[ind] = 0.0f;
+		ind++;
+	}
+
+	starpu_monitor_vector_data(&vector_in, 0, (uintptr_t)invec, size, sizeof(float));
+	starpu_monitor_vector_data(&vector_out, 0, (uintptr_t)outvec, size, sizeof(float));
+
+	vector_in_ptr = invec;
+	vector_out_ptr = outvec;
+}
+
+/*
+ * Task completion callback. arg points to the shared task counter, which is
+ * updated through STARPU_ATOMIC_ADD(.., -1); the reported value is printed.
+ * When that value is zero, the end time is recorded, the matrix and the
+ * output vector are unpartitioned and the semaphore is posted.
+ */
+void init_problem_callback(void *arg)
+{
+	unsigned *counter = arg;
+	unsigned left = STARPU_ATOMIC_ADD(counter, -1);
+
+	printf("callback %d remaining \n", left);
+
+	if (left != 0)
+		return;
+
+	printf("DONE ...\n");
+	gettimeofday(&end, NULL);
+
+	starpu_unpartition_data(sparse_matrix, 0);
+	starpu_unpartition_data(vector_out, 0);
+
+	sem_post(&sem);
+}
+
+
+/*
+ * Partition the matrix and the output vector into nblocks pieces, then create
+ * and submit one SpMV task per piece. Every task reads its matrix piece and
+ * the whole input vector and writes its piece of the output vector.
+ * The start time is taken right before the first task is created.
+ *
+ * The codelet is heap-allocated and shared by all tasks; it is not freed
+ * here since the tasks are still pending when this function returns.
+ */
+void call_spmv_codelet_filters(void)
+{
+
+	remainingtasks = nblocks;
+
+	/* zero-filled so that the fields not assigned below (e.g. cuda_func
+	 * when USE_CUDA is not defined) are not left indeterminate */
+	starpu_codelet *cl = calloc(1, sizeof(starpu_codelet));
+	STARPU_ASSERT(cl);
+
+	/* partition the data along a block distribution */
+	starpu_filter csr_f, vector_f;
+	csr_f.filter_func    = starpu_vertical_block_filter_func_csr;
+	csr_f.filter_arg     = nblocks;
+	vector_f.filter_func = starpu_block_filter_func_vector;
+	vector_f.filter_arg  = nblocks;
+
+	starpu_partition_data(sparse_matrix, &csr_f);
+	starpu_partition_data(vector_out, &vector_f);
+
+	cl->where = CORE|CUDA;
+	cl->core_func =  core_spmv;
+#ifdef USE_CUDA
+	cl->cuda_func = &cuda_spmv;
+#endif
+	cl->nbuffers = 3;
+
+	gettimeofday(&start, NULL);
+
+	unsigned part;
+	for (part = 0; part < nblocks; part++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->callback_func = init_problem_callback;
+		task->callback_arg = &remainingtasks;
+		task->cl = cl;
+		task->cl_arg = NULL;
+
+		task->buffers[0].state = get_sub_data(sparse_matrix, 1, part);
+		task->buffers[0].mode  = R;
+		/* the input vector is not partitioned: every task reads all of it */
+		task->buffers[1].state = vector_in;
+		task->buffers[1].mode = R;
+		task->buffers[2].state = get_sub_data(vector_out, 1, part);
+		task->buffers[2].mode = W;
+
+		starpu_submit_task(task);
+	}
+}
+
+/* Set the problem up: build and register the data, then partition it and submit the tasks. */
+void init_problem(void)
+{
+	/* matrix and vectors */
+	create_data();
+
+	/* partitioning, codelet creation and task submission */
+	call_spmv_codelet_filters();
+}
+
+/* Print the leading entries (at most 16) of the input and output vectors, one pair per line. */
+void print_results(void)
+{
+	unsigned shown = 0;
+
+	while (shown < STARPU_MIN(size, 16))
+	{
+		printf("%2.2f\t%2.2f\n", vector_in_ptr[shown], vector_out_ptr[shown]);
+		shown++;
+	}
+}
+
+/*
+ * Parse the options, start the runtime, build and submit the SpMV problem,
+ * wait for the last task and print the first results and the elapsed time.
+ */
+int main(__attribute__ ((unused)) int argc,
+	__attribute__ ((unused)) char **argv)
+{
+	parse_args(argc, argv);
+
+	/* bring the runtime up before anything is registered */
+	starpu_init();
+
+	sem_init(&sem, 0, 0U);
+
+#ifdef USE_CUDA
+	initialize_cuda();
+#endif
+
+	init_problem();
+
+	/* released by the callback of the last task */
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	print_results();
+
+	/* elapsed time in microseconds, reported in milliseconds */
+	double elapsed_us = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", elapsed_us/1000);
+
+	return 0;
+}

+ 31 - 0
examples/spmv/dw_spmv.h

@@ -0,0 +1,31 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/* Common includes of the SpMV example. The guard is named after this file so
+ * that it cannot clash with the guard of another header. */
+#ifndef __DW_SPMV_H__
+#define __DW_SPMV_H__
+
+#include <semaphore.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+#include <cblas.h>
+
+#include <starpu.h>
+
+#endif // __DW_SPMV_H__

+ 20 - 0
examples/spmv/matrix-market/example_read.c

@@ -0,0 +1,20 @@
+#include "mm_to_bcsr.h"
+
+/*
+ * Parse one block dimension given on the command line; exits with an error
+ * message unless it is a strictly positive decimal integer.
+ */
+static unsigned parse_block_dim(const char *text, const char *what)
+{
+	char *endptr;
+	long value = strtol(text, &endptr, 10);
+
+	if (endptr == text || *endptr != '\0' || value <= 0)
+	{
+		fprintf(stderr, "invalid value '%s' for %s\n", text, what);
+		exit(1);
+	}
+
+	return (unsigned)value;
+}
+
+/*
+ * Usage: example_read matrix-market-filename [c] [r]
+ *
+ * Converts the given Matrix Market file with mm_file_to_bcsr(). The optional
+ * c and r arguments are the block dimensions passed to it; both default to 64.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned c = 64;
+	unsigned r = 64;
+
+	if (argc < 2)
+	{
+		fprintf(stderr, "Usage: %s [matrix-market-filename] [c] [r]\n", argv[0]);
+		exit(1);
+	}
+
+	if (argc > 2)
+		c = parse_block_dim(argv[2], "c");
+
+	if (argc > 3)
+		r = parse_block_dim(argv[3], "r");
+
+	bcsr_t *bcsr;
+	bcsr = mm_file_to_bcsr(argv[1], c, r);
+
+	/* the result is only built, not inspected */
+	(void)bcsr;
+
+	return 0;
+}

+ 522 - 0
examples/spmv/matrix-market/examples/fidapm05.mtx

@@ -0,0 +1,522 @@
+%%MatrixMarket matrix coordinate real general
+42 42 520
+1 1  1.9555555555555e+00
+2 1 -1.9999999999999e-01
+10 1 -1.0666666666667e+00
+11 1 -3.5555555555556e-01
+19 1  1.3322676295502e-15
+20 1  1.1111111111111e-01
+25 1  1.3333333333333e-01
+26 1  4.4444444444444e-03
+27 1  1.3333333333333e-02
+1 2 -1.9999999999999e-01
+2 2  1.2444444444444e+00
+3 2 -2.0000000000001e-01
+4 2 -3.3333333333334e-02
+10 2 -3.5555555555555e-01
+11 2 -4.0000000000000e-01
+12 2 -3.5555555555555e-01
+13 2  1.1111111111111e-01
+19 2  1.1111111111111e-01
+20 2 -6.6666666666669e-02
+21 2  1.1111111111111e-01
+22 2 -2.2222222222222e-02
+25 2  3.3333333333333e-02
+26 2  1.1111111111111e-03
+27 2  6.6666666666666e-03
+28 2  3.3333333333334e-02
+29 2  1.1111111111111e-03
+30 2  6.6666666666669e-03
+2 3 -2.0000000000001e-01
+3 3  1.9555555555555e+00
+4 3 -1.9999999999999e-01
+11 3 -3.5555555555555e-01
+12 3 -1.0666666666667e+00
+13 3 -3.5555555555556e-01
+20 3  1.1111111111111e-01
+21 3  3.5527136788005e-15
+22 3  1.1111111111111e-01
+28 3  1.3333333333333e-01
+29 3  4.4444444444444e-03
+30 3  4.0000000000000e-02
+2 4 -3.3333333333334e-02
+3 4 -1.9999999999999e-01
+4 4  1.2444444444444e+00
+5 4 -2.0000000000001e-01
+6 4 -3.3333333333337e-02
+11 4  1.1111111111111e-01
+12 4 -3.5555555555554e-01
+13 4 -4.0000000000000e-01
+14 4 -3.5555555555555e-01
+15 4  1.1111111111112e-01
+20 4 -2.2222222222222e-02
+21 4  1.1111111111111e-01
+22 4 -6.6666666666664e-02
+23 4  1.1111111111111e-01
+24 4 -2.2222222222223e-02
+28 4  3.3333333333331e-02
+29 4  1.1111111111111e-03
+30 4  1.3333333333333e-02
+31 4  3.3333333333334e-02
+32 4  1.1111111111111e-03
+33 4  1.3333333333334e-02
+4 5 -2.0000000000001e-01
+5 5  1.9555555555555e+00
+6 5 -1.9999999999997e-01
+13 5 -3.5555555555555e-01
+14 5 -1.0666666666667e+00
+15 5 -3.5555555555557e-01
+22 5  1.1111111111111e-01
+23 5  7.5495165674511e-15
+24 5  1.1111111111111e-01
+31 5  1.3333333333333e-01
+32 5  4.4444444444444e-03
+33 5  6.6666666666666e-02
+4 6 -3.3333333333337e-02
+5 6 -1.9999999999997e-01
+6 6  1.2444444444445e+00
+7 6 -2.0000000000003e-01
+8 6 -3.3333333333331e-02
+13 6  1.1111111111111e-01
+14 6 -3.5555555555555e-01
+15 6 -4.0000000000001e-01
+16 6 -3.5555555555554e-01
+17 6  1.1111111111111e-01
+22 6 -2.2222222222223e-02
+23 6  1.1111111111111e-01
+24 6 -6.6666666666670e-02
+31 6  3.3333333333334e-02
+32 6  1.1111111111111e-03
+33 6  2.0000000000000e-02
+34 6  3.3333333333332e-02
+35 6  1.1111111111110e-03
+36 6  1.9999999999999e-02
+40 6  1.1111111111111e-01
+41 6 -2.2222222222222e-02
+6 7 -2.0000000000003e-01
+7 7  1.9555555555555e+00
+8 7 -1.9999999999998e-01
+15 7 -3.5555555555554e-01
+16 7 -1.0666666666666e+00
+17 7 -3.5555555555554e-01
+24 7  1.1111111111110e-01
+34 7  1.3333333333333e-01
+35 7  4.4444444444444e-03
+36 7  9.3333333333331e-02
+40 7 -6.6613381477509e-15
+41 7  1.1111111111112e-01
+6 8 -3.3333333333331e-02
+7 8 -1.9999999999998e-01
+8 8  1.2444444444445e+00
+9 8 -2.0000000000005e-01
+15 8  1.1111111111111e-01
+16 8 -3.5555555555558e-01
+17 8 -3.9999999999998e-01
+18 8 -3.5555555555556e-01
+24 8 -2.2222222222222e-02
+34 8  3.3333333333333e-02
+35 8  1.1111111111112e-03
+36 8  2.6666666666666e-02
+37 8  3.3333333333334e-02
+38 8  1.1111111111111e-03
+39 8  2.6666666666667e-02
+40 8  1.1111111111111e-01
+41 8 -6.6666666666668e-02
+42 8  1.1111111111111e-01
+8 9 -2.0000000000005e-01
+9 9  1.9555555555556e+00
+17 9 -3.5555555555552e-01
+18 9 -1.0666666666667e+00
+37 9  1.3333333333333e-01
+38 9  4.4444444444443e-03
+39 9  1.2000000000000e-01
+41 9  1.1111111111111e-01
+42 9  3.1086244689504e-15
+1 10 -1.0666666666667e+00
+2 10 -3.5555555555555e-01
+10 10  5.6888888888889e+00
+11 10 -1.0666666666667e+00
+19 10 -1.0666666666667e+00
+20 10 -3.5555555555555e-01
+25 10  1.1102230246252e-16
+26 10  1.7777777777778e-02
+27 10  1.3877787807814e-17
+1 11 -3.5555555555556e-01
+2 11 -4.0000000000000e-01
+3 11 -3.5555555555555e-01
+4 11  1.1111111111111e-01
+10 11 -1.0666666666667e+00
+11 11  3.9111111111110e+00
+12 11 -1.0666666666666e+00
+13 11 -5.3290705182007e-15
+19 11 -3.5555555555555e-01
+20 11 -3.9999999999999e-01
+21 11 -3.5555555555555e-01
+22 11  1.1111111111111e-01
+26 11  4.4444444444443e-03
+27 11  5.5511151231258e-17
+28 11  5.5511151231258e-17
+29 11  4.4444444444446e-03
+30 11 -3.4694469519536e-17
+2 12 -3.5555555555555e-01
+3 12 -1.0666666666667e+00
+4 12 -3.5555555555554e-01
+11 12 -1.0666666666666e+00
+12 12  5.6888888888888e+00
+13 12 -1.0666666666667e+00
+20 12 -3.5555555555555e-01
+21 12 -1.0666666666667e+00
+22 12 -3.5555555555555e-01
+28 12  2.2204460492503e-16
+29 12  1.7777777777778e-02
+30 12  1.9428902930940e-16
+2 13  1.1111111111111e-01
+3 13 -3.5555555555556e-01
+4 13 -4.0000000000000e-01
+5 13 -3.5555555555555e-01
+6 13  1.1111111111111e-01
+11 13 -5.3290705182007e-15
+12 13 -1.0666666666667e+00
+13 13  3.9111111111111e+00
+14 13 -1.0666666666667e+00
+15 13 -2.6645352591004e-15
+20 13  1.1111111111111e-01
+21 13 -3.5555555555556e-01
+22 13 -3.9999999999999e-01
+23 13 -3.5555555555555e-01
+24 13  1.1111111111111e-01
+28 13  2.2204460492503e-16
+29 13  4.4444444444443e-03
+30 13  1.6653345369377e-16
+31 13  1.1102230246252e-16
+32 13  4.4444444444446e-03
+33 13  6.9388939039072e-17
+4 14 -3.5555555555555e-01
+5 14 -1.0666666666667e+00
+6 14 -3.5555555555555e-01
+13 14 -1.0666666666667e+00
+14 14  5.6888888888888e+00
+15 14 -1.0666666666666e+00
+22 14 -3.5555555555555e-01
+23 14 -1.0666666666667e+00
+24 14 -3.5555555555554e-01
+31 14 -7.7715611723761e-16
+32 14  1.7777777777778e-02
+33 14 -1.6653345369377e-16
+4 15  1.1111111111112e-01
+5 15 -3.5555555555557e-01
+6 15 -4.0000000000001e-01
+7 15 -3.5555555555554e-01
+8 15  1.1111111111111e-01
+13 15 -2.6645352591004e-15
+14 15 -1.0666666666666e+00
+15 15  3.9111111111110e+00
+16 15 -1.0666666666667e+00
+22 15  1.1111111111112e-01
+23 15 -3.5555555555557e-01
+24 15 -3.9999999999999e-01
+31 15  4.4408920985006e-16
+32 15  4.4444444444444e-03
+34 15 -4.7184478546569e-16
+35 15  4.4444444444444e-03
+36 15 -3.1918911957973e-16
+40 15 -3.5555555555555e-01
+41 15  1.1111111111111e-01
+6 16 -3.5555555555554e-01
+7 16 -1.0666666666666e+00
+8 16 -3.5555555555558e-01
+15 16 -1.0666666666667e+00
+16 16  5.6888888888888e+00
+17 16 -1.0666666666667e+00
+24 16 -3.5555555555553e-01
+35 16  1.7777777777777e-02
+36 16  3.8857805861880e-16
+40 16 -1.0666666666666e+00
+41 16 -3.5555555555558e-01
+6 17  1.1111111111111e-01
+7 17 -3.5555555555554e-01
+8 17 -3.9999999999998e-01
+9 17 -3.5555555555552e-01
+16 17 -1.0666666666667e+00
+17 17  3.9111111111111e+00
+18 17 -1.0666666666667e+00
+24 17  1.1111111111111e-01
+34 17 -2.2204460492503e-16
+35 17  4.4444444444441e-03
+36 17 -2.2204460492503e-16
+37 17 -1.3877787807815e-16
+38 17  4.4444444444448e-03
+39 17 -2.7755575615629e-16
+40 17 -3.5555555555553e-01
+41 17 -3.9999999999997e-01
+42 17 -3.5555555555551e-01
+8 18 -3.5555555555556e-01
+9 18 -1.0666666666667e+00
+17 18 -1.0666666666667e+00
+18 18  5.6888888888888e+00
+37 18 -6.6613381477509e-16
+38 18  1.7777777777778e-02
+39 18 -3.3306690738755e-16
+41 18 -3.5555555555555e-01
+42 18 -1.0666666666667e+00
+1 19  1.3322676295502e-15
+2 19  1.1111111111111e-01
+10 19 -1.0666666666667e+00
+11 19 -3.5555555555555e-01
+19 19  1.9555555555556e+00
+20 19 -2.0000000000000e-01
+25 19 -1.3333333333333e-01
+26 19 -2.2222222222222e-02
+27 19 -1.3333333333333e-02
+1 20  1.1111111111111e-01
+2 20 -6.6666666666669e-02
+3 20  1.1111111111111e-01
+4 20 -2.2222222222222e-02
+10 20 -3.5555555555555e-01
+11 20 -3.9999999999999e-01
+12 20 -3.5555555555555e-01
+13 20  1.1111111111111e-01
+19 20 -2.0000000000000e-01
+20 20  1.2444444444444e+00
+21 20 -2.0000000000001e-01
+22 20 -3.3333333333331e-02
+25 20 -3.3333333333334e-02
+26 20 -5.5555555555556e-03
+27 20 -6.6666666666667e-03
+28 20 -3.3333333333334e-02
+29 20 -5.5555555555557e-03
+30 20 -6.6666666666669e-03
+2 21  1.1111111111111e-01
+3 21  3.5527136788005e-15
+4 21  1.1111111111111e-01
+11 21 -3.5555555555555e-01
+12 21 -1.0666666666667e+00
+13 21 -3.5555555555556e-01
+20 21 -2.0000000000001e-01
+21 21  1.9555555555556e+00
+22 21 -2.0000000000000e-01
+28 21 -1.3333333333333e-01
+29 21 -2.2222222222222e-02
+30 21 -4.0000000000000e-02
+2 22 -2.2222222222222e-02
+3 22  1.1111111111111e-01
+4 22 -6.6666666666664e-02
+5 22  1.1111111111111e-01
+6 22 -2.2222222222223e-02
+11 22  1.1111111111111e-01
+12 22 -3.5555555555555e-01
+13 22 -3.9999999999999e-01
+14 22 -3.5555555555555e-01
+15 22  1.1111111111112e-01
+20 22 -3.3333333333331e-02
+21 22 -2.0000000000000e-01
+22 22  1.2444444444444e+00
+23 22 -2.0000000000001e-01
+24 22 -3.3333333333335e-02
+28 22 -3.3333333333332e-02
+29 22 -5.5555555555553e-03
+30 22 -1.3333333333333e-02
+31 22 -3.3333333333334e-02
+32 22 -5.5555555555556e-03
+33 22 -1.3333333333333e-02
+4 23  1.1111111111111e-01
+5 23  7.5495165674511e-15
+6 23  1.1111111111111e-01
+13 23 -3.5555555555555e-01
+14 23 -1.0666666666667e+00
+15 23 -3.5555555555557e-01
+22 23 -2.0000000000001e-01
+23 23  1.9555555555555e+00
+24 23 -1.9999999999998e-01
+31 23 -1.3333333333333e-01
+32 23 -2.2222222222222e-02
+33 23 -6.6666666666667e-02
+4 24 -2.2222222222223e-02
+5 24  1.1111111111111e-01
+6 24 -6.6666666666670e-02
+7 24  1.1111111111110e-01
+8 24 -2.2222222222222e-02
+13 24  1.1111111111111e-01
+14 24 -3.5555555555554e-01
+15 24 -3.9999999999999e-01
+16 24 -3.5555555555553e-01
+17 24  1.1111111111111e-01
+22 24 -3.3333333333335e-02
+23 24 -1.9999999999998e-01
+24 24  1.2444444444444e+00
+31 24 -3.3333333333334e-02
+32 24 -5.5555555555556e-03
+33 24 -2.0000000000000e-02
+34 24 -3.3333333333331e-02
+35 24 -5.5555555555552e-03
+36 24 -1.9999999999999e-02
+40 24 -2.0000000000004e-01
+41 24 -3.3333333333329e-02
+1 25  1.3333333333333e-01
+2 25  3.3333333333333e-02
+10 25  1.1102230246252e-16
+19 25 -1.3333333333333e-01
+20 25 -3.3333333333334e-02
+25 25  0.0000000000000e+00
+1 26  4.4444444444444e-03
+2 26  1.1111111111111e-03
+10 26  1.7777777777778e-02
+11 26  4.4444444444443e-03
+19 26 -2.2222222222222e-02
+20 26 -5.5555555555556e-03
+26 26  0.0000000000000e+00
+1 27  1.3333333333333e-02
+2 27  6.6666666666666e-03
+10 27  1.3877787807814e-17
+11 27  5.5511151231258e-17
+19 27 -1.3333333333333e-02
+20 27 -6.6666666666667e-03
+27 27  0.0000000000000e+00
+2 28  3.3333333333334e-02
+3 28  1.3333333333333e-01
+4 28  3.3333333333331e-02
+11 28  5.5511151231258e-17
+12 28  2.2204460492503e-16
+13 28  2.2204460492503e-16
+20 28 -3.3333333333334e-02
+21 28 -1.3333333333333e-01
+22 28 -3.3333333333332e-02
+28 28  0.0000000000000e+00
+2 29  1.1111111111111e-03
+3 29  4.4444444444444e-03
+4 29  1.1111111111111e-03
+11 29  4.4444444444446e-03
+12 29  1.7777777777778e-02
+13 29  4.4444444444443e-03
+20 29 -5.5555555555557e-03
+21 29 -2.2222222222222e-02
+22 29 -5.5555555555553e-03
+29 29  0.0000000000000e+00
+2 30  6.6666666666669e-03
+3 30  4.0000000000000e-02
+4 30  1.3333333333333e-02
+11 30 -3.4694469519536e-17
+12 30  1.9428902930940e-16
+13 30  1.6653345369377e-16
+20 30 -6.6666666666669e-03
+21 30 -4.0000000000000e-02
+22 30 -1.3333333333333e-02
+30 30  0.0000000000000e+00
+4 31  3.3333333333334e-02
+5 31  1.3333333333333e-01
+6 31  3.3333333333334e-02
+13 31  1.1102230246252e-16
+14 31 -7.7715611723761e-16
+15 31  4.4408920985006e-16
+22 31 -3.3333333333334e-02
+23 31 -1.3333333333333e-01
+24 31 -3.3333333333334e-02
+31 31  0.0000000000000e+00
+4 32  1.1111111111111e-03
+5 32  4.4444444444444e-03
+6 32  1.1111111111111e-03
+13 32  4.4444444444446e-03
+14 32  1.7777777777778e-02
+15 32  4.4444444444444e-03
+22 32 -5.5555555555556e-03
+23 32 -2.2222222222222e-02
+24 32 -5.5555555555556e-03
+32 32  0.0000000000000e+00
+4 33  1.3333333333334e-02
+5 33  6.6666666666666e-02
+6 33  2.0000000000000e-02
+13 33  6.9388939039072e-17
+14 33 -1.6653345369377e-16
+22 33 -1.3333333333333e-02
+23 33 -6.6666666666667e-02
+24 33 -2.0000000000000e-02
+33 33  0.0000000000000e+00
+6 34  3.3333333333332e-02
+7 34  1.3333333333333e-01
+8 34  3.3333333333333e-02
+15 34 -4.7184478546569e-16
+17 34 -2.2204460492503e-16
+24 34 -3.3333333333331e-02
+34 34  0.0000000000000e+00
+40 34 -1.3333333333333e-01
+41 34 -3.3333333333334e-02
+6 35  1.1111111111110e-03
+7 35  4.4444444444444e-03
+8 35  1.1111111111112e-03
+15 35  4.4444444444444e-03
+16 35  1.7777777777777e-02
+17 35  4.4444444444441e-03
+24 35 -5.5555555555552e-03
+35 35  0.0000000000000e+00
+40 35 -2.2222222222222e-02
+41 35 -5.5555555555555e-03
+6 36  1.9999999999999e-02
+7 36  9.3333333333331e-02
+8 36  2.6666666666666e-02
+15 36 -3.1918911957973e-16
+16 36  3.8857805861880e-16
+17 36 -2.2204460492503e-16
+24 36 -1.9999999999999e-02
+36 36  0.0000000000000e+00
+40 36 -9.3333333333331e-02
+41 36 -2.6666666666667e-02
+8 37  3.3333333333334e-02
+9 37  1.3333333333333e-01
+17 37 -1.3877787807815e-16
+18 37 -6.6613381477509e-16
+37 37  0.0000000000000e+00
+41 37 -3.3333333333334e-02
+42 37 -1.3333333333333e-01
+8 38  1.1111111111111e-03
+9 38  4.4444444444443e-03
+17 38  4.4444444444448e-03
+18 38  1.7777777777778e-02
+38 38  0.0000000000000e+00
+41 38 -5.5555555555556e-03
+42 38 -2.2222222222222e-02
+8 39  2.6666666666667e-02
+9 39  1.2000000000000e-01
+17 39 -2.7755575615629e-16
+18 39 -3.3306690738755e-16
+39 39  0.0000000000000e+00
+41 39 -2.6666666666667e-02
+42 39 -1.2000000000000e-01
+6 40  1.1111111111111e-01
+7 40 -6.6613381477509e-15
+8 40  1.1111111111111e-01
+15 40 -3.5555555555555e-01
+16 40 -1.0666666666666e+00
+17 40 -3.5555555555553e-01
+24 40 -2.0000000000004e-01
+34 40 -1.3333333333333e-01
+35 40 -2.2222222222222e-02
+36 40 -9.3333333333331e-02
+40 40  1.9555555555555e+00
+41 40 -1.9999999999998e-01
+6 41 -2.2222222222222e-02
+7 41  1.1111111111112e-01
+8 41 -6.6666666666668e-02
+9 41  1.1111111111111e-01
+15 41  1.1111111111111e-01
+16 41 -3.5555555555558e-01
+17 41 -3.9999999999997e-01
+18 41 -3.5555555555555e-01
+24 41 -3.3333333333329e-02
+34 41 -3.3333333333334e-02
+35 41 -5.5555555555555e-03
+36 41 -2.6666666666667e-02
+37 41 -3.3333333333334e-02
+38 41 -5.5555555555556e-03
+39 41 -2.6666666666667e-02
+40 41 -1.9999999999998e-01
+41 41  1.2444444444445e+00
+42 41 -2.0000000000005e-01
+8 42  1.1111111111111e-01
+9 42  3.1086244689504e-15
+17 42 -3.5555555555551e-01
+18 42 -1.0666666666667e+00
+37 42 -1.3333333333333e-01
+38 42 -2.2222222222222e-02
+39 42 -1.2000000000000e-01
+41 42 -2.0000000000005e-01
+42 42  1.9555555555556e+00

+ 346 - 0
examples/spmv/matrix-market/mm_to_bcsr.c

@@ -0,0 +1,346 @@
+#include "mm_to_bcsr.h"
+
+/* Some debug functions */
+
+/* Debug helper: dump one block on stdout, r lines of c values each.
+ * Values are read from block->val at index i + j*c. */
+static void print_block(tmp_block_t *block, unsigned r, unsigned c)
+{
+	/* the block coordinates are unsigned, so they are printed with %u */
+	printf(" **** block %u %u **** \n", block->i, block->j);
+
+	unsigned i, j;
+	for (j = 0; j < r; j++) {
+		for (i = 0; i < c; i++) {
+			printf("%2.2f\t", block->val[i + j*c]);
+		}
+		printf("\n");
+	}
+}
+
+/* Debug helper: print every block of the linked list, in list order. */
+static void print_all_blocks(tmp_block_t *block_list, unsigned r, unsigned c)
+{
+	tmp_block_t *node;
+
+	for (node = block_list; node != NULL; node = node->next)
+		print_block(node, r, c);
+}
+
+/* Debug helper: print the block counts and the block size of a BCSR matrix
+ * on stderr. */
+static void print_bcsr(bcsr_t *bcsr)
+{
+	/* every field printed here is unsigned, hence %u rather than %d */
+	fprintf(stderr, "** BCSR **\n");
+	fprintf(stderr, "non zero - blocks = %u\n", bcsr->nnz_blocks);
+	fprintf(stderr, "nrows - blocks = %u\n", bcsr->nrows_blocks);
+	fprintf(stderr, "block size : c %u r %u\n", bcsr->c, bcsr->r);
+}
+
+/* Return the number of blocks in the list (0 for an empty list). */
+static unsigned count_blocks(tmp_block_t *block_list)
+{
+	unsigned nblocks = 0;
+	tmp_block_t *node;
+
+	for (node = block_list; node != NULL; node = node->next)
+		nblocks++;
+
+	return nblocks;
+}
+
+/* Return the number of block rows spanned by the list, i.e. the highest
+ * block row index (j) found in the list plus one. An empty list yields 1. */
+static unsigned count_row_blocks(tmp_block_t *block_list)
+{
+	unsigned highest_row = 0;
+	tmp_block_t *node;
+
+	for (node = block_list; node != NULL; node = node->next)
+		highest_row = (node->j > highest_row) ? node->j : highest_row;
+
+	return highest_row + 1;
+}
+
+
+
+/* Look up the block whose block coordinates are (i,j); returns NULL when the
+ * list does not contain such a block */
+
+static tmp_block_t *search_block(tmp_block_t *block_list, unsigned i, unsigned j)
+{
+	tmp_block_t *node;
+
+	for (node = block_list; node != NULL; node = node->next) {
+		/* skip every block that differs in at least one coordinate */
+		if ((node->i != i) || (node->j != j))
+			continue;
+
+		return node;
+	}
+
+	/* the whole list was scanned without a match */
+	return NULL;
+}
+
+/* Allocate a temporary block holding c*r floats, all initialised to zero.
+ * The returned block is not linked anywhere (next == NULL) and its
+ * coordinates are 0; the caller fills in i and j.
+ * Running out of memory is fatal: the callers in this file dereference the
+ * result unconditionally. */
+static tmp_block_t *create_block(unsigned c, unsigned r)
+{
+	tmp_block_t *block;
+
+	/* calloc so that next, i and j never hold indeterminate values */
+	block = calloc(1, sizeof(*block));
+	if (!block) {
+		fprintf(stderr, "create_block: out of memory\n");
+		exit(1);
+	}
+
+	block->val = calloc((size_t)c*r, sizeof(float));
+	if (!block->val) {
+		fprintf(stderr, "create_block: out of memory\n");
+		exit(1);
+	}
+
+	return block;
+}
+
+/* Tell whether the successor of block sorts after (i,j) in lexical order:
+ * rows (j) are compared first, columns (i) only break ties. Returns 1 when
+ * it does, or when block has no successor, and 0 otherwise. */
+static unsigned next_block_is_bigger(tmp_block_t *block, unsigned i, unsigned j)
+{
+	const tmp_block_t *succ = block->next;
+
+	/* the last block of the list has no successor: nothing counts as bigger */
+	if (succ == NULL)
+		return 1;
+
+	/* different rows settle the comparison on their own */
+	if (succ->j != j)
+		return (succ->j > j) ? 1 : 0;
+
+	/* same row: the column decides */
+	return (succ->i > i);
+}
+
+/* Insert a block in the list, directly at the place that keeps the list
+ * sorted in lexicographical order (row j first, then column i).
+ * (i,j) are the block coordinates of the block being inserted. insert_elem
+ * only calls this after search_block found no block with these coordinates. */
+static void insert_block(tmp_block_t *block, tmp_block_t **block_list, unsigned i, unsigned j)
+{
+	tmp_block_t *current_block = *block_list;
+
+	/* The new block becomes the head when the list is empty or when it
+	 * sorts before the current head. The loop below only ever looks at
+	 * successors, so without this test a block smaller than the head would
+	 * be linked right after it and the list would no longer be sorted. */
+	if (!current_block || (current_block->j > j)
+		|| ((current_block->j == j) && (current_block->i > i))) {
+		block->next = current_block;
+		*block_list = block;
+		return;
+	}
+
+	/* find the last element that is not bigger and insert just after it */
+	while (current_block) {
+		if (next_block_is_bigger(current_block, i, j)) {
+			/* insert block here */
+			block->next = current_block->next;
+			current_block->next = block;
+			return;
+		}
+
+		current_block = current_block->next;
+	}
+
+	/* not reached: the last element always reports a bigger successor */
+}
+
+/* Store one matrix entry, given by its absolute coordinates (abs_i, abs_j),
+ * in the list of blocks. The entry goes into the block that covers it; that
+ * block is created and inserted in the list first when it does not exist. */
+static void insert_elem(tmp_block_t **block_list, unsigned abs_i, unsigned abs_j, float val, unsigned c, unsigned r)
+{
+	/* coordinates of the enclosing block, counted in blocks */
+	const unsigned blk_i = abs_i / c;
+	const unsigned blk_j = abs_j / r;
+
+	tmp_block_t *target = search_block(*block_list, blk_i, blk_j);
+
+	if (target == NULL) {
+		/* first entry falling in this block: create and register it */
+		target = create_block(c, r);
+		target->i = blk_i;
+		target->j = blk_j;
+
+		insert_block(target, block_list, blk_i, blk_j);
+	}
+
+	/* position inside the block, c values per block row */
+	const unsigned in_i = abs_i % c;
+	const unsigned in_j = abs_j % r;
+
+	target->val[in_j * c + in_i] = val;
+}
+
+/* Turn nz (I, J, val) triplets into a linked list of c x r blocks, which is
+ * the intermediate form later converted to BCSR. Returns the head of the
+ * list, or NULL when nz is not positive. */
+static tmp_block_t * mm_to_blocks(int nz, unsigned *I, unsigned *J, float *val, unsigned c, unsigned r)
+{
+	/* the list starts out empty and grows one entry at a time */
+	tmp_block_t *head = NULL;
+	int k = 0;
+
+	while (k < nz) {
+		insert_elem(&head, I[k], J[k], val[k], c, r);
+		k++;
+	}
+
+	return head;
+}
+
+/* Fill the val, colind and rowptr arrays of bcsr from the list of blocks.
+ * The caller must have set bcsr->nrows_blocks and allocated the three arrays
+ * (rowptr with nrows_blocks + 1 entries). The list is expected to be sorted
+ * by block row, so that the blocks of one row end up contiguous. */
+static void fill_bcsr(tmp_block_t *block_list, unsigned c, unsigned r, bcsr_t *bcsr)
+{
+	unsigned block = 0;
+	unsigned row;
+	size_t block_elems = (size_t)c*r;
+	tmp_block_t *current_block;
+
+	/* rowptr[row + 1] first counts the blocks of each row. Marking a row
+	 * start with "still 0" cannot work: 0 is also a valid start offset,
+	 * and a row without any block must point at the start of the *next*
+	 * row, not of the previous one. */
+	memset(bcsr->rowptr, 0, ((size_t)bcsr->nrows_blocks + 1)*sizeof(bcsr->rowptr[0]));
+
+	for (current_block = block_list; current_block; current_block = current_block->next) {
+		/* copy the val from the block to the contiguous area in the BCSR */
+		memcpy(&bcsr->val[block*block_elems], current_block->val, block_elems*sizeof(float));
+
+		/* write the index of the block
+		 * XXX should it be in blocks ? */
+		bcsr->colind[block] = current_block->i;
+
+		bcsr->rowptr[current_block->j + 1]++;
+
+		block++;
+	}
+
+	/* a running sum turns the per-row counts into start offsets: rowptr[0]
+	 * stays 0 and rowptr[nrows_blocks] ends up as the number of blocks */
+	for (row = 0; row < bcsr->nrows_blocks; row++)
+		bcsr->rowptr[row + 1] += bcsr->rowptr[row];
+}
+
+/* Build a BCSR matrix out of a list of c x r blocks. The matrix and its
+ * three arrays are newly allocated; the list itself is left untouched.
+ * Running out of memory is fatal, since the callers use the result
+ * unconditionally. */
+static bcsr_t * blocks_to_bcsr(tmp_block_t *block_list, unsigned c, unsigned r)
+{
+	unsigned nblocks = count_blocks(block_list);
+	unsigned nrows_blocks = count_row_blocks(block_list);
+
+	//print_all_blocks(block_list, r, c);
+
+	bcsr_t *bcsr = malloc(sizeof(*bcsr));
+	if (!bcsr) {
+		fprintf(stderr, "blocks_to_bcsr: out of memory\n");
+		exit(1);
+	}
+
+	bcsr->nnz_blocks = nblocks;
+	bcsr->r = r;
+	bcsr->c = c;
+	bcsr->nrows_blocks = nrows_blocks;
+
+	bcsr->val = malloc((size_t)nblocks*r*c*sizeof(float));
+	bcsr->colind = malloc((size_t)nblocks*sizeof(bcsr->colind[0]));
+	bcsr->rowptr = calloc((size_t)nrows_blocks + 1, sizeof(bcsr->rowptr[0]));
+
+	/* malloc(0) may legitimately return NULL, so val and colind are only
+	 * required when there is at least one block */
+	if (!bcsr->rowptr || (nblocks && (!bcsr->val || !bcsr->colind))) {
+		fprintf(stderr, "blocks_to_bcsr: out of memory\n");
+		exit(1);
+	}
+
+	fill_bcsr(block_list, c, r, bcsr);
+
+	return bcsr;
+}
+
+/* Convert nz matrix entries, given as (I, J, val) triplets, into a BCSR
+ * matrix made of c x r blocks. A summary of the result is printed on
+ * stderr. The input arrays are only read. */
+bcsr_t *mm_to_bcsr(unsigned nz, unsigned *I, unsigned *J, float *val, unsigned c, unsigned r)
+{
+	bcsr_t *bcsr;
+	tmp_block_t *block_list;
+
+	block_list = mm_to_blocks(nz, I, J, val, c, r);
+	bcsr = blocks_to_bcsr(block_list, c, r);
+
+	/* the values and column indices were copied into the BCSR arrays, so
+	 * the temporary list is released here instead of being leaked */
+	while (block_list) {
+		tmp_block_t *next = block_list->next;
+
+		free(block_list->val);
+		free(block_list);
+		block_list = next;
+	}
+
+	print_bcsr(bcsr);
+
+	return bcsr;
+}
+
+/* Read a matrix stored in the Matrix Market coordinate format from filename
+ * and convert it to BCSR with c x r blocks. Complex matrices are rejected.
+ * Any failure (file cannot be opened, bad banner or size line, malformed
+ * entry, out of memory) terminates the program with exit(1). */
+bcsr_t *mm_file_to_bcsr(char *filename, unsigned c, unsigned r)
+{
+	FILE *f;
+	MM_typecode matcode;
+	int M, N;
+	int nz;
+	int i;
+	unsigned *I, *J;
+	float *val;
+
+	bcsr_t *bcsr;
+
+	if ((f = fopen(filename, "r")) == NULL) 
+		exit(1);
+
+	if (mm_read_banner(f, &matcode) != 0)
+	{
+		printf("Could not process Matrix Market banner.\n");
+		exit(1);
+	}
+
+	/*  This is how one can screen matrix types if their application */
+	/*  only supports a subset of the Matrix Market data types.      */
+
+	if (mm_is_complex(matcode) && mm_is_matrix(matcode) &&  mm_is_sparse(matcode) )
+	{
+		printf("Sorry, this application does not support ");
+		printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));
+		exit(1);
+	}
+
+	/* find out size of sparse matrix; M and N are read but not used */
+
+	if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0)
+		exit(1);
+
+	if (nz < 0)
+	{
+		fprintf(stderr, "Invalid number of entries (%d).\n", nz);
+		exit(1);
+	}
+
+	/* reserve memory for the entries */
+
+	I = malloc((size_t)nz * sizeof(*I));
+	J = malloc((size_t)nz * sizeof(*J));
+	/* XXX float ! */
+	val = malloc((size_t)nz * sizeof(*val));
+
+	/* malloc(0) may return NULL, which is not an error */
+	if ((nz > 0) && (!I || !J || !val))
+	{
+		fprintf(stderr, "Out of memory while reading %d entries.\n", nz);
+		exit(1);
+	}
+
+	for (i=0; i<nz; i++)
+	{
+		/* I and J are unsigned, so they are read with %u; fewer than 3
+		 * conversions means the file is truncated or malformed */
+		if (fscanf(f, "%u %u %f\n", &I[i], &J[i], &val[i]) != 3)
+		{
+			fprintf(stderr, "Could not read entry %d of %d.\n", i+1, nz);
+			exit(1);
+		}
+
+		/* indices are 1-based in the file: 0 would wrap around below */
+		if ((I[i] == 0) || (J[i] == 0))
+		{
+			fprintf(stderr, "Invalid index in entry %d of %d.\n", i+1, nz);
+			exit(1);
+		}
+
+		I[i]--;  /* adjust from 1-based to 0-based */
+		J[i]--;
+	}
+
+	/* f always comes from fopen above, so it is closed unconditionally */
+	fclose(f);
+
+	bcsr = mm_to_bcsr((unsigned)nz, I, J, val, c, r);
+
+	free(I);
+	free(J);
+	free(val);
+
+	return bcsr;
+}

+ 36 - 0
examples/spmv/matrix-market/mm_to_bcsr.h

@@ -0,0 +1,36 @@
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "mmio.h"
+
+/* convert a matrix stored in a file with the matrix market format into the 
+ * BCSR format */
+
+typedef struct tmp_block { /* temporary block used while building a BCSR matrix */
+	/* the blocks are chained in a singly linked list; NULL ends the list */
+	struct tmp_block *next;
+
+	/* block coordinates, counted in blocks: column i, row j */
+	unsigned i, j;
+	
+	float *val; /* block values; mm_to_bcsr.c allocates c*r floats and indexes them as local_j*c + local_i */
+
+} tmp_block_t;
+
+typedef struct { /* matrix stored by blocks, as built by mm_to_bcsr() */
+	unsigned r,c; /* block dimensions, as passed to mm_to_bcsr() */
+	unsigned nnz_blocks; /* number of blocks stored */
+	unsigned nrows_blocks; /* number of block rows: highest block row index + 1 */
+
+	float *val; /* values of all the blocks, r*c floats per block */
+	uint32_t *colind; /* one entry per block: its block column (i) */
+	uint32_t *rowptr; /* nrows_blocks + 1 entries; the last one is nnz_blocks */
+} bcsr_t;
+
+
+/* directly read input from a file */
+bcsr_t *mm_file_to_bcsr(char *filename, unsigned c, unsigned r);
+
+/* read the matrix as a set of valuated coordinates */
+bcsr_t *mm_to_bcsr(unsigned nz, unsigned *I, unsigned *J, float *val, unsigned c, unsigned r);

+ 512 - 0
examples/spmv/matrix-market/mmio.c

@@ -0,0 +1,512 @@
+/* 
+*   Matrix Market I/O library for ANSI C
+*
+*   See http://math.nist.gov/MatrixMarket for details.
+*
+*
+*/
+
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+#include "mmio.h"
+
+/* Read a real sparse (coordinate) matrix from the file fname.
+ *
+ * On success returns 0; *M_, *N_ and *nz_ receive the sizes read from the
+ * file, and *val_, *I_ and *J_ receive newly malloc'ed arrays of nz entries
+ * (indices converted to 0-based) that the caller must free.
+ * On failure returns -1; the file is closed and the array outputs are not
+ * written. */
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+                double **val_, int **I_, int **J_)
+{
+    FILE *f;
+    MM_typecode matcode;
+    int M, N, nz;
+    int i;
+    double *val = NULL;
+    int *I = NULL, *J = NULL;
+    int status = -1;
+
+    if ((f = fopen(fname, "r")) == NULL)
+            return -1;
+
+    if (mm_read_banner(f, &matcode) != 0)
+    {
+        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
+        printf(" in file [%s]\n", fname);
+        goto out;
+    }
+
+    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
+            mm_is_sparse(matcode)))
+    {
+        char *type_str = mm_typecode_to_str(matcode);
+
+        fprintf(stderr, "Sorry, this application does not support ");
+        fprintf(stderr, "Market Market type: [%s]\n",
+                type_str ? type_str : "unknown");
+        free(type_str);
+        goto out;
+    }
+
+    /* find out size of sparse matrix: M, N, nz .... */
+
+    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0 || nz < 0)
+    {
+        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
+        goto out;
+    }
+
+    *M_ = M;
+    *N_ = N;
+    *nz_ = nz;
+
+    /* reserve memory for matrices; malloc(0) may return NULL, which is
+     * not an error */
+
+    I = malloc((size_t)nz * sizeof(*I));
+    J = malloc((size_t)nz * sizeof(*J));
+    val = malloc((size_t)nz * sizeof(*val));
+
+    if ((nz > 0) && (!I || !J || !val))
+    {
+        fprintf(stderr, "read_unsymmetric_sparse(): out of memory.\n");
+        goto out;
+    }
+
+    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
+    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
+    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
+
+    for (i=0; i<nz; i++)
+    {
+        if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]) != 3)
+        {
+            fprintf(stderr, "read_unsymmetric_sparse(): could not read entry %d.\n", i+1);
+            goto out;
+        }
+        I[i]--;  /* adjust from 1-based to 0-based */
+        J[i]--;
+    }
+
+    /* ownership of the arrays moves to the caller only on success */
+    *val_ = val;
+    *I_ = I;
+    *J_ = J;
+    val = NULL;
+    I = J = NULL;
+    status = 0;
+
+out:
+    /* single exit point: nothing leaks, whichever step failed */
+    free(val);
+    free(I);
+    free(J);
+    fclose(f);
+
+    return status;
+}
+
+/* Return 1 when matcode is a consistent combination of fields, 0 otherwise.
+ * Rejected: anything that is not a matrix, dense pattern matrices, real
+ * hermitian matrices, and pattern matrices that are hermitian or skew. */
+int mm_is_valid(MM_typecode matcode)
+{
+    int valid = 1;
+
+    if (!mm_is_matrix(matcode))
+        valid = 0;
+    else if (mm_is_dense(matcode) && mm_is_pattern(matcode))
+        valid = 0;
+    else if (mm_is_real(matcode) && mm_is_hermitian(matcode))
+        valid = 0;
+    else if (mm_is_pattern(matcode) &&
+                (mm_is_hermitian(matcode) || mm_is_skew(matcode)))
+        valid = 0;
+
+    return valid;
+}
+
+/* Parse the banner (first line) of a Matrix Market stream and fill *matcode.
+ *
+ * Returns 0 on success, MM_PREMATURE_EOF when no line can be read or when it
+ * has fewer than five fields, MM_NO_HEADER when the line does not start with
+ * the Matrix Market banner, and MM_UNSUPPORTED_TYPE when one of the four
+ * descriptive fields is not recognised. */
+int mm_read_banner(FILE *f, MM_typecode *matcode)
+{
+    char line[MM_MAX_LINE_LENGTH];
+    char banner[MM_MAX_TOKEN_LENGTH];
+    char mtx[MM_MAX_TOKEN_LENGTH]; 
+    char crd[MM_MAX_TOKEN_LENGTH];
+    char data_type[MM_MAX_TOKEN_LENGTH];
+    char storage_scheme[MM_MAX_TOKEN_LENGTH];
+    char *p;
+
+
+    mm_clear_typecode(matcode);  
+
+    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) 
+        return MM_PREMATURE_EOF;
+
+    /* Every token buffer holds MM_MAX_TOKEN_LENGTH (64) bytes while the line
+     * may be far longer: each %s is capped at 63 characters so that an
+     * oversized token cannot overflow its buffer. Keep the widths in sync
+     * with MM_MAX_TOKEN_LENGTH. */
+    if (sscanf(line, "%63s %63s %63s %63s %63s", banner, mtx, crd, data_type, 
+        storage_scheme) != 5)
+        return MM_PREMATURE_EOF;
+
+    /* convert to lower case; tolower() needs a value representable as
+     * unsigned char, hence the casts */
+    for (p=mtx; *p!='\0'; p++)
+        *p = (char) tolower((unsigned char) *p);
+    for (p=crd; *p!='\0'; p++)
+        *p = (char) tolower((unsigned char) *p);
+    for (p=data_type; *p!='\0'; p++)
+        *p = (char) tolower((unsigned char) *p);
+    for (p=storage_scheme; *p!='\0'; p++)
+        *p = (char) tolower((unsigned char) *p);
+
+    /* check for banner */
+    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
+        return MM_NO_HEADER;
+
+    /* first field should be "mtx" */
+    if (strcmp(mtx, MM_MTX_STR) != 0)
+        return  MM_UNSUPPORTED_TYPE;
+    mm_set_matrix(matcode);
+
+
+    /* second field describes whether this is a sparse matrix (in coordinate
+            storage) or a dense array */
+
+
+    if (strcmp(crd, MM_SPARSE_STR) == 0)
+        mm_set_sparse(matcode);
+    else
+    if (strcmp(crd, MM_DENSE_STR) == 0)
+            mm_set_dense(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+    
+
+    /* third field: data type */
+
+    if (strcmp(data_type, MM_REAL_STR) == 0)
+        mm_set_real(matcode);
+    else
+    if (strcmp(data_type, MM_COMPLEX_STR) == 0)
+        mm_set_complex(matcode);
+    else
+    if (strcmp(data_type, MM_PATTERN_STR) == 0)
+        mm_set_pattern(matcode);
+    else
+    if (strcmp(data_type, MM_INT_STR) == 0)
+        mm_set_integer(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+    
+
+    /* fourth field: storage scheme */
+
+    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
+        mm_set_general(matcode);
+    else
+    if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
+        mm_set_symmetric(matcode);
+    else
+    if (strcmp(storage_scheme, MM_HERM_STR) == 0)
+        mm_set_hermitian(matcode);
+    else
+    if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
+        mm_set_skew(matcode);
+    else
+        return MM_UNSUPPORTED_TYPE;
+        
+
+    return 0;
+}
+
+/* Write the size line "M N nz" of a coordinate matrix to f.
+ * Returns 0 on success and MM_COULD_NOT_WRITE_FILE when the write fails. */
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
+{
+    /* fprintf returns the number of characters written, not the number of
+     * items: only a negative value signals a failure */
+    if (fprintf(f, "%d %d %d\n", M, N, nz) < 0)
+        return MM_COULD_NOT_WRITE_FILE;
+    else 
+        return 0;
+}
+
+/* Read the size line "M N nz" of a coordinate matrix, skipping the comment
+ * lines (starting with '%') that precede it.
+ * Returns 0 on success and MM_PREMATURE_EOF when the three values cannot be
+ * read. *M, *N and *nz are set to 0 first, in case we exit with errors. */
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
+{
+    char line[MM_MAX_LINE_LENGTH];
+
+    /* set return null parameter values, in case we exit with errors */
+    *M = *N = *nz = 0;
+
+    /* now continue scanning until you reach the end-of-comments */
+    do 
+    {
+        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
+            return MM_PREMATURE_EOF;
+    }while (line[0] == '%');
+
+    /* line[] is either blank or has M,N, nz */
+    if (sscanf(line, "%d %d %d", M, N, nz) == 3)
+        return 0;
+
+    /* Blank line: the values follow. fscanf skips the remaining white space
+     * (blank lines included) by itself, so one call is enough. Calling it
+     * again after a short read cannot succeed: the offending character is
+     * left in the stream, and the former retry loop would spin forever on
+     * it. */
+    if (fscanf(f, "%d %d %d", M, N, nz) != 3)
+        return MM_PREMATURE_EOF;
+
+    return 0;
+}
+
+
+/* Read the size line "M N" of a dense (array) matrix, skipping the comment
+ * lines (starting with '%') that precede it.
+ * Returns 0 on success and MM_PREMATURE_EOF when the two values cannot be
+ * read. *M and *N are set to 0 first, in case we exit with errors. */
+int mm_read_mtx_array_size(FILE *f, int *M, int *N)
+{
+    char line[MM_MAX_LINE_LENGTH];
+
+    /* set return null parameter values, in case we exit with errors */
+    *M = *N = 0;
+	
+    /* now continue scanning until you reach the end-of-comments */
+    do 
+    {
+        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
+            return MM_PREMATURE_EOF;
+    }while (line[0] == '%');
+
+    /* line[] is either blank or has M,N */
+    if (sscanf(line, "%d %d", M, N) == 2)
+        return 0;
+
+    /* We have a blank line: the values follow. fscanf skips the remaining
+     * white space by itself, so one call is enough. Calling it again after
+     * a short read cannot succeed: the offending character is left in the
+     * stream, and the former retry loop would spin forever on it. */
+    if (fscanf(f, "%d %d", M, N) != 2)
+        return MM_PREMATURE_EOF;
+
+    return 0;
+}
+
+/* Write the size line "M N" of a dense (array) matrix to f.
+ * Returns 0 on success and MM_COULD_NOT_WRITE_FILE when the write fails. */
+int mm_write_mtx_array_size(FILE *f, int M, int N)
+{
+    /* fprintf returns the number of characters written, not the number of
+     * items: only a negative value signals a failure */
+    if (fprintf(f, "%d %d\n", M, N) < 0)
+        return MM_COULD_NOT_WRITE_FILE;
+    else 
+        return 0;
+}
+
+
+
+/*-------------------------------------------------------------------------*/
+
+/******************************************************************/
+/* use when I[], J[], and val[] are already allocated             */
+/******************************************************************/
+
+int mm_read_mtx_crd_data(FILE *f, int M __attribute__ ((unused)) , 
+				 int N __attribute__ ((unused)) , int nz, int I[], int J[],
+        double val[], MM_typecode matcode)
+{
+    int k;
+
+    /* the data type decides how many fields every entry carries */
+    if (mm_is_complex(matcode))
+    {
+        /* real and imaginary parts sit side by side in val[] */
+        for (k = 0; k < nz; k++)
+        {
+            int got = fscanf(f, "%d %d %lg %lg", &I[k], &J[k], &val[2*k], &val[2*k+1]);
+
+            if (got != 4)
+                return MM_PREMATURE_EOF;
+        }
+
+        return 0;
+    }
+
+    if (mm_is_real(matcode))
+    {
+        for (k = 0; k < nz; k++)
+        {
+            int got = fscanf(f, "%d %d %lg\n", &I[k], &J[k], &val[k]);
+
+            if (got != 3)
+                return MM_PREMATURE_EOF;
+        }
+
+        return 0;
+    }
+
+    if (mm_is_pattern(matcode))
+    {
+        /* pattern matrices only list coordinates: val[] is not touched */
+        for (k = 0; k < nz; k++)
+        {
+            int got = fscanf(f, "%d %d", &I[k], &J[k]);
+
+            if (got != 2)
+                return MM_PREMATURE_EOF;
+        }
+
+        return 0;
+    }
+
+    return MM_UNSUPPORTED_TYPE;
+}
+
+/* Read a single coordinate entry from f according to the data type held in
+ * matcode: "I J real imag" for complex, "I J real" for real, "I J" for
+ * pattern matrices. Returns 0 on success, MM_PREMATURE_EOF when the entry
+ * is incomplete and MM_UNSUPPORTED_TYPE for any other data type. */
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
+        double *real, double *imag, MM_typecode matcode)
+{
+    int expected;
+    int got;
+
+    if (mm_is_complex(matcode))
+    {
+        expected = 4;
+        got = fscanf(f, "%d %d %lg %lg", I, J, real, imag);
+    }
+    else if (mm_is_real(matcode))
+    {
+        expected = 3;
+        got = fscanf(f, "%d %d %lg\n", I, J, real);
+    }
+    else if (mm_is_pattern(matcode))
+    {
+        expected = 2;
+        got = fscanf(f, "%d %d", I, J);
+    }
+    else
+        return MM_UNSUPPORTED_TYPE;
+
+    return (got == expected) ? 0 : MM_PREMATURE_EOF;
+}
+
+
+/************************************************************************
+    mm_read_mtx_crd()  fills M, N, nz, array of values, and return
+                        type code, e.g. 'MCRS'
+
+                        if matrix is complex, values[] is of size 2*nz,
+                            (nz pairs of real/imaginary values)
+
+                        fname may be "stdin" to read the standard input.
+                        Returns 0 on success, an MM_* error code otherwise.
+                        On success the caller owns *I, *J and *val (*val is
+                        NULL when no value array is needed). On failure the
+                        file is closed, and any array allocated here has
+                        been freed and its pointer reset to NULL.
+************************************************************************/
+
+int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, 
+        double **val, MM_typecode *matcode)
+{
+    int ret_code;
+    FILE *f;
+
+    if (strcmp(fname, "stdin") == 0) f=stdin;
+    else
+    if ((f = fopen(fname, "r")) == NULL)
+        return MM_COULD_NOT_READ_FILE;
+
+
+    if ((ret_code = mm_read_banner(f, matcode)) != 0)
+        goto out;
+
+    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && 
+            mm_is_matrix(*matcode)))
+    {
+        ret_code = MM_UNSUPPORTED_TYPE;
+        goto out;
+    }
+
+    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
+        goto out;
+
+
+    *I = malloc((size_t)*nz * sizeof(int));
+    *J = malloc((size_t)*nz * sizeof(int));
+    *val = NULL;
+
+    if (mm_is_complex(*matcode))
+        *val = malloc((size_t)*nz * 2 * sizeof(double));
+    else if (mm_is_real(*matcode))
+        *val = malloc((size_t)*nz * sizeof(double));
+
+    /* malloc(0) may return NULL, so the arrays are only required when there
+     * is something to store. There is no dedicated out-of-memory code:
+     * MM_COULD_NOT_READ_FILE is the closest one available. */
+    if ((*nz != 0) && (!*I || !*J ||
+            (!*val && (mm_is_complex(*matcode) || mm_is_real(*matcode)))))
+    {
+        ret_code = MM_COULD_NOT_READ_FILE;
+        goto out_free;
+    }
+
+    /* as before, entries are only read for complex, real and pattern data */
+    if (mm_is_complex(*matcode) || mm_is_real(*matcode) ||
+            mm_is_pattern(*matcode))
+    {
+        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
+                *matcode);
+        if (ret_code != 0) goto out_free;
+    }
+
+    goto out;
+
+out_free:
+    free(*I);
+    free(*J);
+    free(*val);
+    *I = NULL;
+    *J = NULL;
+    *val = NULL;
+
+out:
+    /* the stream is closed on every path, not only on success */
+    if (f != stdin) fclose(f);
+    return ret_code;
+}
+
+/* Write the Matrix Market banner line followed by the textual form of
+ * matcode to f. Returns 0 on success, MM_UNSUPPORTED_TYPE when matcode has
+ * no textual form, and MM_COULD_NOT_WRITE_FILE when the write fails. */
+int mm_write_banner(FILE *f, MM_typecode matcode)
+{
+    char *str = mm_typecode_to_str(matcode);
+    int ret_code;
+
+    /* mm_typecode_to_str returns NULL for a typecode it cannot describe;
+     * NULL must not be handed to %s */
+    if (str == NULL)
+        return MM_UNSUPPORTED_TYPE;
+
+    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
+    free(str);
+
+    /* fprintf returns a character count: only a negative value is an error */
+    if (ret_code < 0)
+        return MM_COULD_NOT_WRITE_FILE;
+    else
+        return 0;
+}
+
+/* Write a sparse matrix in coordinate format to the file fname ("stdout"
+ * selects the standard output): banner, size line, then one line per entry.
+ * For complex matrices val[] holds nz (real, imaginary) pairs.
+ * Returns 0 on success, MM_COULD_NOT_WRITE_FILE when the file cannot be
+ * opened and MM_UNSUPPORTED_TYPE when matcode cannot be written. */
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+        double val[], MM_typecode matcode)
+{
+    FILE *f;
+    int i;
+    char *typecode_str;
+
+    /* resolved before the file is opened, so that an invalid typecode does
+     * not create the output file; NULL must not be handed to %s */
+    typecode_str = mm_typecode_to_str(matcode);
+    if (typecode_str == NULL)
+        return MM_UNSUPPORTED_TYPE;
+
+    if (strcmp(fname, "stdout") == 0) 
+        f = stdout;
+    else
+    if ((f = fopen(fname, "w")) == NULL)
+    {
+        free(typecode_str);
+        return MM_COULD_NOT_WRITE_FILE;
+    }
+    
+    /* print banner followed by typecode */
+    fprintf(f, "%s ", MatrixMarketBanner);
+    fprintf(f, "%s\n", typecode_str);
+
+    /* the string is heap-allocated by mm_typecode_to_str */
+    free(typecode_str);
+
+    /* print matrix sizes and nonzeros */
+    fprintf(f, "%d %d %d\n", M, N, nz);
+
+    /* print values */
+    if (mm_is_pattern(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d\n", I[i], J[i]);
+    else
+    if (mm_is_real(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
+    else
+    if (mm_is_complex(matcode))
+        for (i=0; i<nz; i++)
+            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i], 
+                        val[2*i+1]);
+    else
+    {
+        if (f != stdout) fclose(f);
+        return MM_UNSUPPORTED_TYPE;
+    }
+
+    if (f !=stdout) fclose(f);
+
+    return 0;
+}
+  
+
+/**
+*  Create a new copy of a string s.  mm_strdup() is a common routine, but
+*  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
+*
+*  Returns a malloc'ed copy that the caller must free, or NULL when the
+*  allocation fails.
+*/
+char *mm_strdup(const char *s)
+{
+	/* the terminating '\0' is part of the copy */
+	size_t size = strlen(s) + 1;
+	char *s2 = malloc(size);
+
+	if (s2 == NULL)
+		return NULL;
+
+	return memcpy(s2, s, size);
+}
+
+/* Build the textual form of matcode, i.e. the four words that follow the
+ * banner: object, format, data type and storage scheme.
+ * Returns a malloc'ed string that the caller must free, or NULL when one of
+ * the four fields is not recognised or when memory runs out. */
+char  *mm_typecode_to_str(MM_typecode matcode)
+{
+    char buffer[MM_MAX_LINE_LENGTH];
+    const char *types[4];
+
+    /* check for MTX type; bailing out here keeps types[0] from being read
+       uninitialised below */
+    if (mm_is_matrix(matcode)) 
+        types[0] = MM_MTX_STR;
+    else
+        return NULL;
+
+    /* check for CRD or ARR matrix */
+    if (mm_is_sparse(matcode))
+        types[1] = MM_SPARSE_STR;
+    else
+    if (mm_is_dense(matcode))
+        types[1] = MM_DENSE_STR;
+    else
+        return NULL;
+
+    /* check for element data type */
+    if (mm_is_real(matcode))
+        types[2] = MM_REAL_STR;
+    else
+    if (mm_is_complex(matcode))
+        types[2] = MM_COMPLEX_STR;
+    else
+    if (mm_is_pattern(matcode))
+        types[2] = MM_PATTERN_STR;
+    else
+    if (mm_is_integer(matcode))
+        types[2] = MM_INT_STR;
+    else
+        return NULL;
+
+
+    /* check for symmetry type */
+    if (mm_is_general(matcode))
+        types[3] = MM_GENERAL_STR;
+    else
+    if (mm_is_symmetric(matcode))
+        types[3] = MM_SYMM_STR;
+    else 
+    if (mm_is_hermitian(matcode))
+        types[3] = MM_HERM_STR;
+    else 
+    if (mm_is_skew(matcode))
+        types[3] = MM_SKEW_STR;
+    else
+        return NULL;
+
+    /* bounded write: the buffer size is stated instead of assumed */
+    snprintf(buffer, sizeof(buffer), "%s %s %s %s", types[0], types[1],
+                types[2], types[3]);
+    return mm_strdup(buffer);
+
+}

+ 133 - 0
examples/spmv/matrix-market/mmio.h

@@ -0,0 +1,133 @@
+/* 
+*   Matrix Market I/O library for ANSI C
+*
+*   See http://math.nist.gov/MatrixMarket for details.
+*
+*
+*/
+
+#ifndef MM_IO_H
+#define MM_IO_H
+
+#define MM_MAX_LINE_LENGTH 1025
+#define MatrixMarketBanner "%%MatrixMarket"
+#define MM_MAX_TOKEN_LENGTH 64
+
+typedef char MM_typecode[4];
+
+char *mm_typecode_to_str(MM_typecode matcode);
+
+int mm_read_banner(FILE *f, MM_typecode *matcode);
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
+int mm_read_mtx_array_size(FILE *f, int *M, int *N);
+
+int mm_write_banner(FILE *f, MM_typecode matcode);
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
+int mm_write_mtx_array_size(FILE *f, int M, int N);
+
+
+/********************* MM_typecode query functions ***************************/
+
+#define mm_is_matrix(typecode)	((typecode)[0]=='M')
+
+#define mm_is_sparse(typecode)	((typecode)[1]=='C')
+#define mm_is_coordinate(typecode)((typecode)[1]=='C')
+#define mm_is_dense(typecode)	((typecode)[1]=='A')
+#define mm_is_array(typecode)	((typecode)[1]=='A')
+
+#define mm_is_complex(typecode)	((typecode)[2]=='C')
+#define mm_is_real(typecode)		((typecode)[2]=='R')
+#define mm_is_pattern(typecode)	((typecode)[2]=='P')
+#define mm_is_integer(typecode) ((typecode)[2]=='I')
+
+#define mm_is_symmetric(typecode)((typecode)[3]=='S')
+#define mm_is_general(typecode)	((typecode)[3]=='G')
+#define mm_is_skew(typecode)	((typecode)[3]=='K')
+#define mm_is_hermitian(typecode)((typecode)[3]=='H')
+
+int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */
+
+
+/********************* MM_typecode modify functions ***************************/
+
+#define mm_set_matrix(typecode)	((*typecode)[0]='M')
+#define mm_set_coordinate(typecode)	((*typecode)[1]='C')
+#define mm_set_array(typecode)	((*typecode)[1]='A')
+#define mm_set_dense(typecode)	mm_set_array(typecode)
+#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
+
+#define mm_set_complex(typecode)((*typecode)[2]='C')
+#define mm_set_real(typecode)	((*typecode)[2]='R')
+#define mm_set_pattern(typecode)((*typecode)[2]='P')
+#define mm_set_integer(typecode)((*typecode)[2]='I')
+
+
+#define mm_set_symmetric(typecode)((*typecode)[3]='S')
+#define mm_set_general(typecode)((*typecode)[3]='G')
+#define mm_set_skew(typecode)	((*typecode)[3]='K')
+#define mm_set_hermitian(typecode)((*typecode)[3]='H')
+
+#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
+									(*typecode)[2]=' ',(*typecode)[3]='G')
+
+#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
+
+
+/********************* Matrix Market error codes ***************************/
+
+
+#define MM_COULD_NOT_READ_FILE	11
+#define MM_PREMATURE_EOF		12
+#define MM_NOT_MTX				13
+#define MM_NO_HEADER			14
+#define MM_UNSUPPORTED_TYPE		15
+#define MM_LINE_TOO_LONG		16
+#define MM_COULD_NOT_WRITE_FILE	17
+
+
+/******************** Matrix Market internal definitions ********************
+
+   MM_matrix_typecode: 4-character sequence
+
+				    object 		sparse/   	data        storage 
+						  		dense     	type        scheme
+
+   string position:	 [0]        [1]			[2]         [3]
+
+   Matrix typecode:  M(atrix)  C(oord)		R(eal)   	G(eneral)
+						        A(array)	C(omplex)   H(ermitian)
+											P(attern)   S(ymmetric)
+								    		I(nteger)	K (skew)
+
+ ***********************************************************************/
+
+#define MM_MTX_STR		"matrix"
+#define MM_ARRAY_STR	"array"
+#define MM_DENSE_STR	"array"
+#define MM_COORDINATE_STR "coordinate" 
+#define MM_SPARSE_STR	"coordinate"
+#define MM_COMPLEX_STR	"complex"
+#define MM_REAL_STR		"real"
+#define MM_INT_STR		"integer"
+#define MM_GENERAL_STR  "general"
+#define MM_SYMM_STR		"symmetric"
+#define MM_HERM_STR		"hermitian"
+#define MM_SKEW_STR		"skew-symmetric"
+#define MM_PATTERN_STR  "pattern"
+
+
+/*  high level routines */
+
+int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
+		 double val[], MM_typecode matcode);
+int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
+		double val[], MM_typecode matcode);
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
+			MM_typecode matcode);
+
+int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
+                double **val_, int **I_, int **J_);
+
+
+
+#endif

+ 43 - 0
examples/strassen/Makefile.in

@@ -0,0 +1,43 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# pkg-config looks for libstarpu in the directory substituted by configure
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+LIBS+=$$(pkg-config --libs libstarpu)
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+# the BLAS wrappers shared by the examples are linked in as a plain object
+LDFLAGS+= ../common/blas.o
+
+# all and clean do not produce files of the same name: declaring them phony
+# keeps them working even if a file called "all" or "clean" ever shows up
+.PHONY: all clean
+
+all: dw_strassen
+
+strassen_kernels.o: strassen.h strassen_kernels.c
+	$(CC) $(CFLAGS) strassen_kernels.c -c -o strassen_kernels.o
+
+strassen.o: strassen.h strassen.c
+	$(CC) $(CFLAGS) strassen.c -c -o strassen.o
+
+test_strassen.o: test_strassen.c strassen.h
+	$(CC) $(CFLAGS) test_strassen.c -c -o test_strassen.o
+
+strassen_models.o: strassen_models.c strassen_models.h
+	$(CC) $(CFLAGS) strassen_models.c -c -o strassen_models.o
+
+dw_strassen: $(STARPU) strassen_models.o strassen.o strassen_kernels.o test_strassen.o ../common/blas.o
+	$(CC) strassen_models.o strassen.o strassen_kernels.o test_strassen.o -o dw_strassen $(LDFLAGS) $(LIBS)
+
+# remove the objects, the dependency and coverage files, and the binary
+clean:
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f dw_strassen

+ 515 - 0
examples/strassen/strassen.c

@@ -0,0 +1,515 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "strassen.h"
+#include "strassen_models.h"
+
+/*
+ * Allocate a temporary single-precision matrix with the same dimensions as M
+ * and register it with StarPU.
+ *
+ * Returns the handle of the new matrix; allocation failures trip
+ * STARPU_ASSERT.  The matrix is meant to be released with free_tmp_matrix().
+ *
+ * NOTE(review): `state' is allocated here and then passed by address to
+ * starpu_monitor_blas_data(); confirm whether that call overwrites the
+ * handle, in which case this first allocation is lost.
+ */
+static starpu_data_handle create_tmp_matrix(starpu_data_handle M)
+{
+	float *data;
+	starpu_data_handle state = malloc(sizeof(starpu_data_handle));
+
+	/* create a matrix with the same dimensions as M: the second dimension
+	 * must be read with the ny accessor, not nx again */
+	uint32_t nx = starpu_get_blas_nx(M);
+	uint32_t ny = starpu_get_blas_ny(M);
+
+	STARPU_ASSERT(state);
+
+	/* size computed in size_t so that nx*ny cannot wrap in 32 bits */
+	data = malloc((size_t)nx*ny*sizeof(float));
+	STARPU_ASSERT(data);
+
+	/* the buffer is contiguous, hence nx is passed twice (the first
+	 * occurrence is presumably the leading dimension) */
+	starpu_monitor_blas_data(&state, 0, (uintptr_t)data, nx, nx, ny, sizeof(float));
+	
+	return state;
+}
+
+/*
+ * Release a temporary matrix handle: starpu_delete_data() is called on it,
+ * then the handle itself is passed to free().
+ *
+ * NOTE(review): the float buffer allocated by create_tmp_matrix() is not
+ * freed here -- confirm whether starpu_delete_data() releases it, otherwise
+ * it leaks.
+ */
+static void free_tmp_matrix(starpu_data_handle matrix)
+{
+	starpu_delete_data(matrix);
+	free(matrix);
+}
+
+
+/*
+ * Split iter->A, iter->B and iter->C in two along both filters and store the
+ * handles of the resulting four sub-matrices of each in iter->{A,B,C}{11,12,21,22}.
+ */
+static void partition_matrices(strassen_iter_state_t *iter)
+{
+
+	starpu_data_handle A = iter->A;
+	starpu_data_handle B = iter->B;
+	starpu_data_handle C = iter->C;
+
+	/* first filter: two blocks */
+	starpu_filter f;
+	f.filter_func = starpu_block_filter_func;
+	f.filter_arg = 2;
+
+	/* second filter: two blocks in the other ("vertical") direction */
+	starpu_filter f2;
+	f2.filter_func = starpu_vertical_block_filter_func;
+	f2.filter_arg = 2;
+
+	/* apply both filters to each of the three matrices */
+	starpu_map_filters(A, 2, &f, &f2);
+	starpu_map_filters(B, 2, &f, &f2);
+	starpu_map_filters(C, 2, &f, &f2);
+
+	/* Xrc is fetched with indices (c-1, r-1): e.g. X12 uses (1, 0) */
+	iter->A11 = get_sub_data(A, 2, 0, 0);
+	iter->A12 = get_sub_data(A, 2, 1, 0);
+	iter->A21 = get_sub_data(A, 2, 0, 1);
+	iter->A22 = get_sub_data(A, 2, 1, 1);
+
+	iter->B11 = get_sub_data(B, 2, 0, 0);
+	iter->B12 = get_sub_data(B, 2, 1, 0);
+	iter->B21 = get_sub_data(B, 2, 0, 1);
+	iter->B22 = get_sub_data(B, 2, 1, 1);
+
+	iter->C11 = get_sub_data(C, 2, 0, 0);
+	iter->C12 = get_sub_data(C, 2, 1, 0);
+	iter->C21 = get_sub_data(C, 2, 0, 1);
+	iter->C22 = get_sub_data(C, 2, 1, 1);
+
+	/* TODO check that all sub-matrices have the same size */
+}
+
+/*
+ * Undo partition_matrices(): call starpu_unpartition_data() on the three
+ * top-level matrices of this iteration (second argument 0 in each call).
+ */
+static void unpartition_matrices(strassen_iter_state_t *iter)
+{
+	/* TODO there is no  need to actually gather those results ... */
+	starpu_unpartition_data(iter->A, 0);
+	starpu_unpartition_data(iter->B, 0);
+	starpu_unpartition_data(iter->C, 0);
+}
+
+/* C = A1 + A2: three buffers, CPU kernel plus a CUBLAS one when USE_CUDA is set */
+static starpu_codelet cl_add = {
+	.where = ANY,
+	.model = &strassen_model_add_sub,
+	.core_func = add_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = add_cublas_codelet,
+#endif
+	.nbuffers = 3
+};
+
+/* C = A1 - A2: same layout and performance model as cl_add */
+static starpu_codelet cl_sub = {
+	.where = ANY,
+	.model = &strassen_model_add_sub,
+	.core_func = sub_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = sub_cublas_codelet,
+#endif
+	.nbuffers = 3
+};
+
+/* matrix product used at the bottom of the recursion: three buffers */
+static starpu_codelet cl_mult = {
+	.where = ANY,
+	.model = &strassen_model_mult,
+	.core_func = mult_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = mult_cublas_codelet,
+#endif
+	.nbuffers = 3
+};
+
+/* C += A: in-place update, only two buffers */
+static starpu_codelet cl_self_add = {
+	.where = ANY,
+	.model = &strassen_model_self_add_sub,
+	.core_func = self_add_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = self_add_cublas_codelet,
+#endif
+	.nbuffers = 2
+};
+
+/* C -= A: in-place update, only two buffers */
+static starpu_codelet cl_self_sub = {
+	.where = ANY,
+	.model = &strassen_model_self_add_sub,
+	.core_func = self_sub_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = self_sub_cublas_codelet,
+#endif
+	.nbuffers = 2
+};
+
+/*
+ * Build and submit the task computing C = (A1 op A2) for ADD, SUB and MULT,
+ * or C op= A1 for SELFADD and SELFSUB (A2 is still stored in the third buffer
+ * slot, but the self codelets only declare two buffers).
+ *
+ * `callback' and `argcallback' are registered on the task as its callback
+ * function and argument.
+ */
+static void compute_add_sub_op(starpu_data_handle A1, operation op,
+				starpu_data_handle A2, starpu_data_handle C, 
+				void (*callback)(void *), void *argcallback)
+{
+	struct starpu_task *t = starpu_task_create();
+
+	t->cl_arg = NULL;
+	t->use_tag = 0;
+
+	/* buffer 0 is the destination, buffers 1 and 2 are the operands */
+	t->buffers[0].state = C;
+	t->buffers[0].mode = W;
+	t->buffers[1].state = A1;
+	t->buffers[1].mode = R;
+	t->buffers[2].state = A2;
+	t->buffers[2].mode = R;
+
+	t->callback_func = callback;
+	t->callback_arg = argcallback;
+
+	switch (op) {
+		case ADD:
+		case SUB:
+		case MULT:
+			/* the three-operand forms need every handle */
+			STARPU_ASSERT(A1);
+			STARPU_ASSERT(A2);
+			STARPU_ASSERT(C);
+			if (op == ADD)
+				t->cl = &cl_add;
+			else if (op == SUB)
+				t->cl = &cl_sub;
+			else
+				t->cl = &cl_mult;
+			break;
+		case SELFADD:
+		case SELFSUB:
+			/* in-place update: the destination is read as well */
+			t->buffers[0].mode = RW;
+			t->cl = (op == SELFADD) ? &cl_self_add : &cl_self_sub;
+			break;
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	starpu_submit_task(t);
+}
+
+/*
+ * Callback run when one "Cij +=/-= Ek" accumulation is done.
+ *
+ * _arg is a heap-allocated phase3_t; it is freed here.  The function
+ * decrements the remaining-use count of Ek and frees Ek when the result is 0,
+ * then decrements the per-iteration counter; when that reaches 0 the matrices
+ * are unpartitioned, the iteration callback is invoked and the iteration
+ * state is freed.
+ */
+void phase_3_callback_function(void *_arg)
+{
+	unsigned cnt, use_cnt;
+	phase3_t *arg = _arg;
+
+	/* copy what is needed out of the argument before releasing it */
+	unsigned i = arg->i;
+	strassen_iter_state_t *iter = arg->iter;
+
+	free(arg);
+
+	/* NOTE(review): the == 0 tests below assume STARPU_ATOMIC_ADD yields
+	 * the updated value -- confirm against its definition */
+	use_cnt = STARPU_ATOMIC_ADD(&iter->Ei_remaining_use[i], -1);
+	if (use_cnt == 0) 
+	{
+		/* no one needs Ei anymore : free it */
+		switch (i) {
+			case 0:
+				free_tmp_matrix(iter->E1);
+				break;
+			case 1:
+				free_tmp_matrix(iter->E2);
+				break;
+			case 2:
+				free_tmp_matrix(iter->E3);
+				break;
+			case 3:
+				free_tmp_matrix(iter->E4);
+				break;
+			case 4:
+				free_tmp_matrix(iter->E5);
+				break;
+			case 5:
+				free_tmp_matrix(iter->E6);
+				break;
+			case 6:
+				free_tmp_matrix(iter->E7);
+				break;
+			default:
+				STARPU_ASSERT(0);
+		}
+	}
+
+	/* one fewer accumulation pending for this iteration */
+	cnt = STARPU_ATOMIC_ADD(&iter->counter, -1);
+	if (cnt == 0)
+	{
+		/* the entire strassen iteration is done ! */
+		unpartition_matrices(iter);
+
+		// XXX free the Ei
+		STARPU_ASSERT(iter->strassen_iter_callback);
+		iter->strassen_iter_callback(iter->argcb);
+
+		/* iter must not be touched after this point */
+		free(iter);
+	}
+}
+
+
+
+/*
+ * Callback run once the product Ei (i = arg->i, 0-based) is computed.
+ *
+ * _arg is a heap-allocated phase2_t; it is freed here.  The temporaries that
+ * held the operands of Ei are released, then Ei is accumulated into one or
+ * two quadrants of C.  Each accumulation gets its own heap-allocated phase3_t,
+ * which phase_3_callback_function() frees.
+ */
+void phase_2_callback_function(void *_arg)
+{
+	phase2_t *arg = _arg;
+
+	strassen_iter_state_t *iter = arg->iter;
+	unsigned i = arg->i;
+
+	free(arg);
+
+	/* two arguments are prepared up front; products that feed a single
+	 * quadrant free the second one below */
+	phase3_t *arg1, *arg2;
+	arg1 = malloc(sizeof(phase3_t));
+	arg2 = malloc(sizeof(phase3_t));
+
+	arg1->iter = iter;
+	arg2->iter = iter;
+
+	arg1->i = i;
+	arg2->i = i;
+
+	switch (i) {
+		case 0:
+			free(arg2); // will not be needed .. 
+			free_tmp_matrix(iter->E11);
+			free_tmp_matrix(iter->E12);
+			/* C11 += E1 */
+			compute_add_sub_op(iter->E1, SELFADD, NULL, iter->C11, phase_3_callback_function, arg1);
+			break;
+		case 1:
+			free_tmp_matrix(iter->E21);
+			free_tmp_matrix(iter->E22);
+			/* C11 += E2 */
+			compute_add_sub_op(iter->E2, SELFADD, NULL, iter->C11, phase_3_callback_function, arg1);
+			/* C22 += E2 */
+			compute_add_sub_op(iter->E2, SELFADD, NULL, iter->C22, phase_3_callback_function, arg2);
+			break;
+		case 2:
+			free(arg2); // will not be needed .. 
+			free_tmp_matrix(iter->E31);
+			free_tmp_matrix(iter->E32);
+			/* C22 -= E3 */
+			compute_add_sub_op(iter->E3, SELFSUB, NULL, iter->C22, phase_3_callback_function, arg1);
+			break;
+		case 3:
+			/* only E41 is a temporary: E42 aliases B22 */
+			free_tmp_matrix(iter->E41);
+			/* C11 -= E4 */
+			compute_add_sub_op(iter->E4, SELFSUB, NULL, iter->C11, phase_3_callback_function, arg1);
+			/* C12 += E4 */
+			compute_add_sub_op(iter->E4, SELFADD, NULL, iter->C12, phase_3_callback_function, arg2);
+			break;
+		case 4:
+			/* only E52 is a temporary: E51 aliases A11 */
+			free_tmp_matrix(iter->E52);
+			/* C12 += E5 */
+			compute_add_sub_op(iter->E5, SELFADD, NULL, iter->C12, phase_3_callback_function, arg1);
+			/* C22 += E5 */
+			compute_add_sub_op(iter->E5, SELFADD, NULL, iter->C22, phase_3_callback_function, arg2);
+			break;
+		case 5:
+			/* only E62 is a temporary: E61 aliases A22 */
+			free_tmp_matrix(iter->E62);
+			/* C11 += E6 */
+			compute_add_sub_op(iter->E6, SELFADD, NULL, iter->C11, phase_3_callback_function, arg1);
+			/* C21 += E6 */
+			compute_add_sub_op(iter->E6, SELFADD, NULL, iter->C21, phase_3_callback_function, arg2);
+			break;
+		case 6:
+			/* only E71 is a temporary: E72 aliases B11 */
+			free_tmp_matrix(iter->E71);
+			/* C21 += E7 */
+			compute_add_sub_op(iter->E7, SELFADD, NULL, iter->C21, phase_3_callback_function, arg1);
+			/* C22 -= E7 */
+			compute_add_sub_op(iter->E7, SELFSUB, NULL, iter->C22, phase_3_callback_function, arg2);
+			break;
+		default:
+			STARPU_ASSERT(0);
+	}
+}
+
+
+/*
+ * Launch the computation of the product Ei = Ei1 * Ei2 (i is 0-based).
+ *
+ * A temporary matrix with the dimensions of Ei1 is created to hold Ei, then
+ * strassen() is called with one recursion level less;
+ * phase_2_callback_function() is its completion callback and receives a
+ * heap-allocated phase2_t that it frees.
+ */
+static void _strassen_phase_2(strassen_iter_state_t *iter, unsigned i)
+{
+	phase2_t *phase_2_arg = malloc(sizeof(phase2_t));
+
+	phase_2_arg->iter = iter;
+	phase_2_arg->i = i;
+
+	/* XXX */
+	/* operands (A, B) and destination (C) of the product selected by i */
+	starpu_data_handle A;
+	starpu_data_handle B;
+	starpu_data_handle C;
+
+	switch (i) {
+		case 0:
+			A = iter->E11; B = iter->E12;
+			iter->E1 = create_tmp_matrix(A);
+			C = iter->E1;
+			break;
+		case 1:
+			A = iter->E21; B = iter->E22;
+			iter->E2 = create_tmp_matrix(A);
+			C = iter->E2;
+			break;
+		case 2:
+			A = iter->E31; B = iter->E32;
+			iter->E3 = create_tmp_matrix(A);
+			C = iter->E3;
+			break;
+		case 3:
+			A = iter->E41; B = iter->E42;
+			iter->E4 = create_tmp_matrix(A);
+			C = iter->E4;
+			break;
+		case 4:
+			A = iter->E51; B = iter->E52;
+			iter->E5 = create_tmp_matrix(A);
+			C = iter->E5;
+			break;
+		case 5:
+			A = iter->E61; B = iter->E62;
+			iter->E6 = create_tmp_matrix(A);
+			C = iter->E6;
+			break;
+		case 6:
+			A = iter->E71; B = iter->E72;
+			iter->E7 = create_tmp_matrix(A);
+			C = iter->E7;
+			break;
+		default:
+			/* A, B and C stay unset on this path */
+			STARPU_ASSERT(0);
+	}
+
+	STARPU_ASSERT(A);
+	STARPU_ASSERT(B);
+	STARPU_ASSERT(C);
+
+	// DEBUG XXX
+	//compute_add_sub_op(A, MULT, B, C, phase_2_callback_function, phase_2_arg);
+	/* recurse: strassen() falls back to a plain product when the level is 0 */
+	strassen(A, B, C, phase_2_callback_function, phase_2_arg, iter->reclevel-1);
+}
+
+
+#define THRESHHOLD	128
+
+/*
+ * Callback run when one operand (Ei1 or Ei2) of the product Ei is ready.
+ *
+ * _arg is a heap-allocated phase1_t; it is freed here.  The per-product
+ * counter iter->Ei12[i] is incremented and phase 2 is started for Ei when
+ * the value obtained is 2.
+ */
+static void phase_1_callback_function(void *_arg)
+{
+
+	phase1_t *arg = _arg;
+	strassen_iter_state_t *iter = arg->iter;
+	unsigned i = arg->i;
+
+	free(arg);
+
+	/* NOTE(review): assumes STARPU_ATOMIC_ADD yields the updated value;
+	 * products with an operand known from the start begin at 1 */
+	unsigned cnt = STARPU_ATOMIC_ADD(&iter->Ei12[i], +1);
+
+	if (cnt == 2) {
+		/* Ei1 and Ei2 are ready, compute Ei */
+		_strassen_phase_2(iter, i);
+	}
+}
+
+/*
+ * computes Ei1 or Ei2 with i in 0-6
+ *
+ * Submits C = (A1 opA A2) and registers phase_1_callback_function() as its
+ * callback.  The callback receives a heap-allocated phase1_t naming the
+ * iteration and the product index i; the callback frees it.
+ */
+static void _strassen_phase_1(starpu_data_handle A1, operation opA, starpu_data_handle A2,
+			      starpu_data_handle C, strassen_iter_state_t *iter, unsigned i)
+{
+	phase1_t *phase_1_arg = malloc(sizeof(phase1_t));
+
+	/* fail loudly instead of dereferencing NULL on allocation failure */
+	STARPU_ASSERT(phase_1_arg);
+
+	phase_1_arg->iter = iter;
+	phase_1_arg->i = i;
+
+	compute_add_sub_op(A1, opA, A2, C, phase_1_callback_function, phase_1_arg);
+}
+
+/*
+ * Allocate and initialise the bookkeeping of one Strassen recursion level.
+ *
+ * A, B, C are the operands and the destination of this level;
+ * strassen_iter_callback(argcb) is recorded as the completion callback.
+ * The sub-matrix handles, the temporaries and `reclevel' are NOT set here:
+ * the caller fills them in.
+ *
+ * Returns the new state; allocation failure trips STARPU_ASSERT.
+ */
+strassen_iter_state_t *init_strassen_iter_state(starpu_data_handle A, starpu_data_handle B, starpu_data_handle C, void (*strassen_iter_callback)(void *), void *argcb)
+{
+	strassen_iter_state_t *iter_state = malloc(sizeof(strassen_iter_state_t));
+	STARPU_ASSERT(iter_state);
+
+	/* number of operands of Ei already available: products whose operand
+	 * is a plain sub-matrix start at 1 */
+	iter_state->Ei12[0] = 0;
+	iter_state->Ei12[1] = 0;
+	iter_state->Ei12[2] = 0;
+	iter_state->Ei12[3] = 1; // E42 = B22
+	iter_state->Ei12[4] = 1; // E51 = A11
+	iter_state->Ei12[5] = 1; // E61 = A22
+	iter_state->Ei12[6] = 1; // E72 = B11
+
+	/* number of Cij accumulations that consume each Ei */
+	iter_state->Ei_remaining_use[0] = 1; 
+	iter_state->Ei_remaining_use[1] = 2;
+	iter_state->Ei_remaining_use[2] = 1;
+	iter_state->Ei_remaining_use[3] = 2;
+	iter_state->Ei_remaining_use[4] = 2;
+	iter_state->Ei_remaining_use[5] = 2;
+	iter_state->Ei_remaining_use[6] = 2;
+
+	unsigned i;
+	/* Ei has 7 entries: clear all of them, including the last one */
+	for (i = 0; i < 7; i++)
+	{
+		iter_state->Ei[i] = 0;
+	}
+
+	for (i = 0; i < 4; i++)
+	{
+		iter_state->Cij[i] = 0;
+	}
+
+	iter_state->strassen_iter_callback = strassen_iter_callback;
+	iter_state->argcb = argcb;
+
+	iter_state->A = A;
+	iter_state->B = B;
+	iter_state->C = C;
+
+	/* total number of accumulations: the sum of Ei_remaining_use[] */
+	iter_state->counter = 12;
+
+	return iter_state;
+}
+
+/*
+ * Perform one level of Strassen recursion for C = A * B.
+ *
+ * The three matrices are partitioned into quadrants, temporaries are created
+ * for the operand sums/differences, and the ten phase-1 tasks computing them
+ * are submitted.  Everything else (products, accumulation into C, cleanup,
+ * invocation of strassen_iter_callback(argcb)) is driven by the phase
+ * callbacks.
+ *
+ * Products and where they are accumulated:
+ *   E1 = (A12 - A22)(B21 + B22)   C11 += E1
+ *   E2 = (A11 + A22)(B11 + B22)   C11 += E2, C22 += E2
+ *   E3 = (A11 - A21)(B11 + B12)   C22 -= E3
+ *   E4 = (A11 + A12) B22          C11 -= E4, C12 += E4
+ *   E5 = A11 (B12 - B22)          C12 += E5, C22 += E5
+ *   E6 = A22 (B21 - B11)          C11 += E6, C21 += E6
+ *   E7 = (A21 + A22) B11          C21 += E7, C22 -= E7
+ */
+static void _do_strassen(starpu_data_handle A, starpu_data_handle B, starpu_data_handle C, void (*strassen_iter_callback)(void *), void *argcb, unsigned reclevel)
+{
+	/* do one level of recursion in the strassen algorithm */
+	strassen_iter_state_t *iter = init_strassen_iter_state(A, B, C, strassen_iter_callback, argcb);
+
+	partition_matrices(iter);
+	iter->reclevel = reclevel;
+
+	/* some Eij are already known */
+	iter->E11 = create_tmp_matrix(iter->A12);
+	iter->E12 = create_tmp_matrix(iter->B21);
+	iter->E21 = create_tmp_matrix(iter->A11);
+	iter->E22 = create_tmp_matrix(iter->B11);
+	iter->E31 = create_tmp_matrix(iter->A11);
+	iter->E32 = create_tmp_matrix(iter->B11);
+	iter->E41 = create_tmp_matrix(iter->A11);
+	iter->E42 = iter->B22;
+	iter->E51 = iter->A11;
+	iter->E52 = create_tmp_matrix(iter->B12);
+	iter->E61 = iter->A22;
+	iter->E62 = create_tmp_matrix(iter->B21);
+	iter->E71 = create_tmp_matrix(iter->A21);
+	iter->E72 = iter->B11;
+
+	/* compute all Eij */
+	/* E11 is A12 - A22: the product that only feeds C11 is
+	 * (A12 - A22)(B21 + B22) */
+	_strassen_phase_1(iter->A12, SUB, iter->A22, iter->E11, iter, 0);
+	_strassen_phase_1(iter->B21, ADD, iter->B22, iter->E12, iter, 0);
+	_strassen_phase_1(iter->A11, ADD, iter->A22, iter->E21, iter, 1);
+	_strassen_phase_1(iter->B11, ADD, iter->B22, iter->E22, iter, 1);
+	_strassen_phase_1(iter->A11, SUB, iter->A21, iter->E31, iter, 2);
+	_strassen_phase_1(iter->B11, ADD, iter->B12, iter->E32, iter, 2);
+	_strassen_phase_1(iter->A11, ADD, iter->A12, iter->E41, iter, 3);
+	_strassen_phase_1(iter->B12, SUB, iter->B22, iter->E52, iter, 4);
+	_strassen_phase_1(iter->B21, SUB, iter->B11, iter->E62, iter, 5);
+	_strassen_phase_1(iter->A21, ADD, iter->A22, iter->E71, iter, 6);
+}
+
+
+/*
+ * Entry point: asynchronously compute C = A * B.
+ *
+ * While reclevel is non-zero one more level of Strassen recursion is
+ * performed; at level 0 a single MULT task is submitted instead.  In both
+ * cases `callback' is handed on together with `argcb' as the completion
+ * callback.
+ */
+void strassen(starpu_data_handle A, starpu_data_handle B, starpu_data_handle C, void (*callback)(void *), void *argcb, unsigned reclevel)
+{
+	if (reclevel != 0)
+	{
+		/* recurse once more */
+		_do_strassen(A, B, C, callback, argcb, reclevel);
+		return;
+	}
+
+	/* bottom of the recursion: don't use Strassen but a simple
+	 * multiplication task */
+	compute_add_sub_op(A, MULT, B, C, callback, argcb);
+}

+ 114 - 0
examples/strassen/strassen.h

@@ -0,0 +1,114 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STRASSEN_H__
+#define __STRASSEN_H__
+
+#include <semaphore.h>
+#include <sys/time.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+#include <cblas.h>
+
+#include <starpu_config.h>
+#ifdef USE_CUDA
+#include <cuda.h>
+#include <cublas.h>
+#endif
+
+#include <starpu.h>
+
+/* Operation selected when a task is built in strassen.c */
+typedef enum {
+	ADD,		/* C = A1 + A2 */
+	SUB,		/* C = A1 - A2 */
+	MULT,		/* matrix product, used at the bottom of the recursion */
+	SELFADD,	/* C += A1 */
+	SELFSUB,	/* C -= A1 */
+	NONE
+} operation;
+
+/* Bookkeeping of one level of Strassen recursion */
+typedef struct {
+	/* monitor the progress of the algorithm */
+	unsigned Ei12[7]; // Ei12[k] is 0, 1 or 2 (2 = finished Ei1 and Ei2)
+	unsigned Ei[7];
+	/* number of accumulations into C that still need Ei */
+	unsigned Ei_remaining_use[7];
+	unsigned Cij[4];
+
+	/* operands, destination, and their four quadrants */
+	starpu_data_handle A, B, C;
+	starpu_data_handle A11, A12, A21, A22;
+	starpu_data_handle B11, B12, B21, B22;
+	starpu_data_handle C11, C12, C21, C22;
+
+	/* the seven products, and the temporaries holding their operands */
+	starpu_data_handle E1, E2, E3, E4, E5, E6, E7;
+	starpu_data_handle E11, E12, E21, E22, E31, E32, E41, E52, E62, E71;
+
+	/* operands that are plain quadrants of A or B (no temporary) */
+	starpu_data_handle E42, E51, E61, E72;
+
+	/* recursion level of this iteration */
+	unsigned reclevel;
+	
+	/* number of accumulations into C still pending */
+	unsigned counter;
+
+	/* called at the end of the iteration */
+	void (*strassen_iter_callback)(void *);
+	void *argcb;
+} strassen_iter_state_t;
+
+/* Callback argument of a phase 1 task */
+typedef struct {
+	strassen_iter_state_t *iter;
+
+	/* phase 1 computes Ei1 or Ei2 with i in 0-6 */
+	unsigned i;
+} phase1_t;
+
+/* Callback argument of a phase 2 computation */
+typedef struct {
+	strassen_iter_state_t *iter;
+
+	/* phase 2 computes Ei with i in 0-6 */
+	unsigned i;
+} phase2_t;
+
+/* Callback argument of a phase 3 task */
+typedef struct {
+	strassen_iter_state_t *iter;
+
+	/* phase 3 accumulates Ei into a quadrant of C (Cij +=/-= Ei), i in 0-6 */
+	unsigned i;
+} phase3_t;
+
+void mult_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+void sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+void add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+void self_add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+void self_sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+
+#ifdef USE_CUDA
+void mult_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+void sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+void add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+void self_add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+void self_sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+#endif
+
+void strassen(starpu_data_handle A, starpu_data_handle B, starpu_data_handle C, void (*callback)(void *), void *argcb, unsigned reclevel);
+
+extern struct starpu_perfmodel_t strassen_model_mult;
+extern struct starpu_perfmodel_t strassen_model_add_sub;
+extern struct starpu_perfmodel_t strassen_model_self_add_sub;
+
+#endif // __STRASSEN_H__

+ 198 - 0
examples/strassen/strassen_kernels.c

@@ -0,0 +1,198 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "strassen.h"
+
+
+/*
+ * Common body of the multiplication codelets.
+ *
+ * buffers[0] is the destination ("center"), buffers[1] and buffers[2] the
+ * two operands ("left" and "right").  s selects the backend: 0 for CBLAS,
+ * 1 for CUBLAS (only when USE_CUDA is defined); any other value asserts.
+ *
+ * NOTE(review): both calls pass alpha = -1 and beta = 1 with one operand
+ * transposed, i.e. they accumulate a negated product onto the existing
+ * content of the destination, whereas callers describe the operation as
+ * C = A * B -- confirm the intended coefficients.
+ */
+static void mult_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused))  void *arg)
+{
+	float *center 	= (float *)buffers[0].blas.ptr;
+	float *left 	= (float *)buffers[1].blas.ptr;
+	float *right 	= (float *)buffers[2].blas.ptr;
+
+	/* dimensions are taken from the destination, plus nx of the left operand */
+	unsigned dx = buffers[0].blas.nx;
+	unsigned dy = buffers[0].blas.ny;
+	unsigned dz = buffers[1].blas.nx;
+
+	/* leading dimensions of the left operand, right operand and destination */
+	unsigned ld21 = buffers[1].blas.ld;
+	unsigned ld12 = buffers[2].blas.ld;
+	unsigned ld22 = buffers[0].blas.ld;
+
+	switch (s) {
+		case 0:
+			cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, 
+				dy, dx, dz, -1.0f, left, ld21, right, ld12,
+					     1.0f, center, ld22);
+			break;
+#ifdef USE_CUDA
+		case 1:
+			/* operands are passed in the opposite order to the CBLAS call */
+			cublasSgemm('t', 'n', dx, dy, dz, 
+					-1.0f, right, ld12, left, ld21, 
+					 1.0f, center, ld22);
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+/* CPU entry point of the multiplication codelet (backend 0) */
+void mult_core_codelet(starpu_data_interface_t *descr, void *_args)
+{
+	mult_common_codelet(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS entry point of the multiplication codelet (backend 1) */
+void mult_cublas_codelet(starpu_data_interface_t *descr, void *_args)
+{
+	mult_common_codelet(descr, 1, _args);
+}
+#endif
+
+/*
+ * Common body of the ADD and SUB codelets: C = A + alpha * B, computed line
+ * by line.
+ *
+ * buffers[0] is C, buffers[1] is A and buffers[2] is B.  alpha is +1 for an
+ * addition and -1 for a subtraction.  s selects the backend: 0 for CBLAS,
+ * 1 for CUBLAS (only when USE_CUDA is defined); any other value asserts.
+ *
+ * C is a pure output (the task registers it write-only), so each line of A
+ * is first copied into C rather than accumulated onto whatever C held
+ * before.
+ */
+static void add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused))  void *arg, float alpha)
+{
+	/* C = A op B */
+
+	float *C 	= (float *)buffers[0].blas.ptr;
+	float *A 	= (float *)buffers[1].blas.ptr;
+	float *B 	= (float *)buffers[2].blas.ptr;
+
+	/* the dimensions are taken from the destination */
+	unsigned dx = buffers[0].blas.nx;
+	unsigned dy = buffers[0].blas.ny;
+
+	unsigned ldA = buffers[1].blas.ld;
+	unsigned ldB = buffers[2].blas.ld;
+	unsigned ldC = buffers[0].blas.ld;
+
+	// TODO check dim ...
+
+	unsigned line;
+
+	switch (s) {
+		case 0:
+			for (line = 0; line < dy; line++)
+			{
+				/* copy line A into C */
+				cblas_scopy(dx, &A[line*ldA], 1, &C[line*ldC], 1);
+				/* add line B to C = A */
+				cblas_saxpy(dx, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
+			}
+			break;
+#ifdef USE_CUDA
+		case 1:
+			for (line = 0; line < dy; line++)
+			{
+				/* copy line A into C */
+				cublasScopy(dx, &A[line*ldA], 1, &C[line*ldC], 1);
+				/* add line B to C = A */
+				cublasSaxpy(dx, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
+			}
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+/* CPU subtraction: common body with backend 0 and alpha = -1 */
+void sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	add_sub_common_codelet(descr, 0, arg, -1.0f);
+}
+
+/* CPU addition: common body with backend 0 and alpha = +1 */
+void add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	add_sub_common_codelet(descr, 0, arg, 1.0f);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS subtraction: common body with backend 1 and alpha = -1 */
+void sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	add_sub_common_codelet(descr, 1, arg, -1.0f);
+}
+
+/* CUBLAS addition: common body with backend 1 and alpha = +1 */
+void add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	add_sub_common_codelet(descr, 1, arg, 1.0f);
+}
+#endif
+
+
+/*
+ * Common body of the SELFADD and SELFSUB codelets: C += alpha * A, computed
+ * line by line.
+ *
+ * buffers[0] is C (updated in place) and buffers[1] is A.  alpha is +1 for
+ * an addition and -1 for a subtraction.  s selects the backend: 0 for CBLAS,
+ * 1 for CUBLAS (only when USE_CUDA is defined); any other value asserts.
+ */
+static void self_add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused))  void *arg, float alpha)
+{
+	/* C +=/-= A */
+
+	float *C 	= (float *)buffers[0].blas.ptr;
+	float *A 	= (float *)buffers[1].blas.ptr;
+
+	/* the dimensions are taken from the destination */
+	unsigned dx = buffers[0].blas.nx;
+	unsigned dy = buffers[0].blas.ny;
+
+	unsigned ldA = buffers[1].blas.ld;
+	unsigned ldC = buffers[0].blas.ld;
+
+	// TODO check dim ...
+	
+	unsigned line;
+
+	switch (s) {
+		case 0:
+			for (line = 0; line < dy; line++)
+			{
+				/* add line A to C */
+				cblas_saxpy(dx, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
+			}
+			break;
+#ifdef USE_CUDA
+		case 1:
+			for (line = 0; line < dy; line++)
+			{
+				/* add line A to C */
+				cublasSaxpy(dx, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
+			}
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+
+
+
+/* CPU in-place addition: common body with backend 0 and alpha = +1 */
+void self_add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	self_add_sub_common_codelet(descr, 0, arg, 1.0f);
+}
+
+/* CPU in-place subtraction: common body with backend 0 and alpha = -1 */
+void self_sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	self_add_sub_common_codelet(descr, 0, arg, -1.0f);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS in-place addition: common body with backend 1 and alpha = +1 */
+void self_add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	self_add_sub_common_codelet(descr, 1, arg, 1.0f);
+}
+
+/* CUBLAS in-place subtraction: common body with backend 1 and alpha = -1 */
+void self_sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	self_add_sub_common_codelet(descr, 1, arg, -1.0f);
+}
+#endif

+ 156 - 0
examples/strassen/strassen_models.c

@@ -0,0 +1,156 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "strassen_models.h"
+
+#include <starpu.h>
+
+/*
+ * As a convention, in this file, descr[0] is represented by A,
+ * 				  descr[1] is B ...
+ */
+
+/*
+ *	Number of flops of Gemm 
+ */
+
+//#define USE_PERTURBATION	1
+
+
+#ifdef USE_PERTURBATION
+#define PERTURBATE(a)	((drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
+#else
+#define PERTURBATE(a)	(a)
+#endif
+
+
+/*
+ * Cost estimate of an in-place add/sub, registered for the core
+ * architecture: proportional to n*n where n is the nx dimension of
+ * descr[0].  The constants are empirical (units not stated here).
+ */
+static double self_add_sub_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_get_blas_nx(descr[0].state);
+
+	/* same formula as the CUDA variant, further divided by 7.75 */
+	double cost = (n*n)/10.0f/4.0f/7.75f;
+
+#ifdef MODEL_DEBUG
+	printf("self add sub cost %e n = %d\n", cost, n);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of an in-place add/sub, registered for the CUDA
+ * architecture: proportional to n*n where n is the nx dimension of
+ * descr[0].
+ */
+static double cuda_self_add_sub_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = (n*n)/10.0f/4.0f;
+
+#ifdef MODEL_DEBUG
+	printf("self add sub cost %e n = %d\n", cost, n);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of a three-operand add/sub, registered for the core
+ * architecture: proportional to n*n where n is the nx dimension of
+ * descr[0].  The constants are empirical (units not stated here).
+ */
+static double add_sub_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = (1.45f*n*n)/10.0f/2.0f;
+
+#ifdef MODEL_DEBUG
+	printf("add sub cost %e n = %d\n", cost, n);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of a three-operand add/sub, registered for the CUDA
+ * architecture.
+ *
+ * NOTE(review): the formula is identical to add_sub_cost() -- confirm
+ * whether a CUDA-specific factor is missing.
+ */
+static double cuda_add_sub_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = (1.45f*n*n)/10.0f/2.0f;
+
+#ifdef MODEL_DEBUG
+	printf("add sub cost %e n = %d\n", cost, n);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+
+/*
+ * Cost estimate of a matrix product, registered for the core architecture:
+ * proportional to n^3 where n is the nx dimension of descr[0].  The
+ * constants are empirical (units not stated here).
+ */
+static double mult_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_get_blas_nx(descr[0].state);
+
+	/* n is converted to double first, so n*n*n is not computed in 32 bits;
+	 * same formula as the CUDA variant, further divided by 0.2588 */
+	double cost = (((double)(n)*n*n)/1000.0f/4.11f/0.2588);
+
+#ifdef MODEL_DEBUG
+	printf("mult cost %e n = %d \n", cost, n);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/*
+ * Cost estimate of a matrix product, registered for the CUDA architecture:
+ * proportional to n^3 where n is the nx dimension of descr[0].
+ */
+static double cuda_mult_cost(starpu_buffer_descr *descr)
+{
+	uint32_t n;
+
+	n = starpu_get_blas_nx(descr[0].state);
+
+	double cost = (((double)(n)*n*n)/1000.0f/4.11f);
+
+#ifdef MODEL_DEBUG
+	printf("mult cost %e n = %d \n", cost, n);
+#endif
+
+	return PERTURBATE(cost);
+}
+
+/* Performance model of the multiplication codelet: one cost function per
+ * architecture, history-based type */
+struct starpu_perfmodel_t strassen_model_mult = {
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = mult_cost },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_mult_cost }
+	},
+	.type = HISTORY_BASED,
+	.symbol = "strassen_model_mult"
+};
+
+/* Performance model shared by the three-operand add and sub codelets */
+struct starpu_perfmodel_t strassen_model_add_sub = {
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = add_sub_cost },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_add_sub_cost }
+	},
+	.type = HISTORY_BASED,
+	.symbol = "strassen_model_add_sub"
+};
+
+/* Performance model shared by the in-place add and sub codelets */
+struct starpu_perfmodel_t strassen_model_self_add_sub = {
+	.per_arch = { 
+		[STARPU_CORE_DEFAULT] = { .cost_model = self_add_sub_cost },
+		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_self_add_sub_cost }
+	},
+	.type = HISTORY_BASED,
+	.symbol = "strassen_model_self_add_sub"
+};

+ 22 - 0
examples/strassen/strassen_models.h

@@ -0,0 +1,22 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STRASSEN_MODELS_H__
+#define __STRASSEN_MODELS_H__
+
+#include <starpu.h>
+
+#endif // __STRASSEN_MODELS_H__

+ 188 - 0
examples/strassen/test_strassen.c

@@ -0,0 +1,188 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "strassen.h"
+#include <sys/time.h>
+
+unsigned dim = 4096;
+unsigned reclevel = 4;
+unsigned norandom = 0;
+
+sem_t sem;
+
+float *A;
+float *B;
+float *C;
+
+starpu_data_handle A_state;
+starpu_data_handle B_state;
+starpu_data_handle C_state;
+
+struct timeval start;
+struct timeval end;
+
+
+/* to compute MFlop/s */
+uint64_t flop_cublas = 0;
+uint64_t flop_atlas = 0;
+
+/* to compute MB/s (load/store) */
+uint64_t ls_cublas = 0;
+uint64_t ls_atlas = 0;
+
+/* 
+ * Strassen complexity : n = 2^k matrices, stops at 2^r : recursion = (k-r) levels
+ * 	m = n / 2^rec
+ * 	M(k) = 7^(k-r) 8^r = 7^rec (m^3)
+ * 	A(k) = 4^r (2^r + 5) 7^(k-r) - 6 x 4^k = (m^2)(m+5)*7^rec - 6n^2 
+ *
+ * 	4n^2.807
+ */
+/*
+ * Number of floating point operations (additions + multiplications) counted
+ * for an n x n product with `rec' levels of recursion, following the
+ * formulas given in the comment above.
+ */
+double strassen_complexity(unsigned n, unsigned rec)
+{
+	/* 7^rec leaf products, each on a block of size m = n / 2^rec */
+	const double leaves = pow(7.0, (double)rec);
+	const double m = (1.0*n)/(pow(2.0, (double)rec));
+
+	const double mult = (m*m*m)*leaves;
+	const double add = ((m*m)*(m+5)*leaves - 6.0*n*n);
+
+	return (add+mult);
+}
+
+/*
+ * That program should compute C = A * B 
+ * 
+ *   A of size (z,y)
+ *   B of size (x,z)
+ *   C of size (x,y)
+ */
+
+/*
+ * Completion callback of the top-level strassen() call: records the end
+ * time, prints the elapsed time and the flop figures, then posts `sem' so
+ * that main() can proceed.
+ */
+void terminate(void *arg __attribute__ ((unused)))
+{
+	gettimeofday(&end, NULL);
+
+	/* elapsed time in microseconds */
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	//uint64_t total_flop = flop_cublas + flop_atlas;
+	double total_flop =  strassen_complexity(dim, reclevel);//4.0*pow((double)dim, 2.807);
+
+	/* only the time in ms goes to stdout; the labels go to stderr */
+	fprintf(stderr, "Computation took (ms):\n");
+	printf("%2.2f\n", timing/1000);
+	fprintf(stderr, "	GFlop : total (%2.2f)\n", (double)total_flop/1000000000.0f);
+	/* flop per microsecond divided by 1000 gives GFlop/s */
+	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
+
+	sem_post(&sem);
+}
+
+/*
+ * Parse the command line:
+ *   -size <n>    matrix dimension (stored in `dim')
+ *   -rec <n>     number of recursion levels (stored in `reclevel')
+ *   -no-random   fill the matrices deterministically (sets `norandom')
+ *
+ * An option that requires a value but is the last argument is reported on
+ * stderr and ignored, so argv is never read past argc.  A value consumed by
+ * an option is not re-examined as an option itself.  Unknown arguments are
+ * ignored.
+ */
+void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			if (i + 1 < argc)
+				dim = strtol(argv[++i], NULL, 10);
+			else
+				fprintf(stderr, "missing value after %s\n", argv[i]);
+		}
+		else if (strcmp(argv[i], "-rec") == 0) {
+			if (i + 1 < argc)
+				reclevel = strtol(argv[++i], NULL, 10);
+			else
+				fprintf(stderr, "missing value after %s\n", argv[i]);
+		}
+		else if (strcmp(argv[i], "-no-random") == 0) {
+			norandom = 1;
+		}
+	}
+}
+
+/*
+ * Allocate and fill the three dim x dim matrices, register them with StarPU,
+ * record the start time and launch the asynchronous computation C = A * B.
+ * terminate() is invoked when the computation is over.
+ *
+ * With `norandom' set, A and B are filled from their indices; otherwise they
+ * are filled with drand48() values drawn from a fixed seed, so that runs are
+ * reproducible.  C is zeroed.
+ */
+void init_problem(void)
+{
+	unsigned i,j;
+
+#ifdef USE_FXT
+	fxt_register_thread(0);
+#endif
+
+	/* element count computed in size_t so that dim*dim cannot wrap */
+	size_t nelems = (size_t)dim*dim;
+
+	A = malloc(nelems*sizeof(float));
+	B = malloc(nelems*sizeof(float));
+	C = malloc(nelems*sizeof(float));
+
+	STARPU_ASSERT(A);
+	STARPU_ASSERT(B);
+	STARPU_ASSERT(C);
+
+	/* fill the A and B matrices */
+	if (norandom) {
+		for (i=0; i < dim; i++) {
+			for (j=0; j < dim; j++) {
+				A[i+(size_t)j*dim] = (float)(i);
+			}
+		}
+	
+		for (i=0; i < dim; i++) {
+			for (j=0; j < dim; j++) {
+				B[i+(size_t)j*dim] = (float)(j);
+			}
+		}
+	} 
+	else {
+		/* the values come from drand48(), which is seeded by srand48()
+		 * (srand() only affects rand()) */
+		srand48(2008);
+		for (j=0; j < dim; j++) {
+			for (i=0; i < dim; i++) {
+				A[i+(size_t)j*dim] = (float)(drand48());
+			}
+		}
+	
+		for (j=0; j < dim; j++) {
+			for (i=0; i < dim; i++) {
+				B[i+(size_t)j*dim] = (float)(drand48());
+			}
+		}
+	}
+
+	for (j=0; j < dim; j++) {
+		for (i=0; i < dim; i++) {
+			C[i+(size_t)j*dim] = (float)(0);
+		}
+	}
+
+	/* contiguous storage: the same value is passed for all three size arguments */
+	starpu_monitor_blas_data(&A_state, 0, (uintptr_t)A, 
+		dim, dim, dim, sizeof(float));
+	starpu_monitor_blas_data(&B_state, 0, (uintptr_t)B, 
+		dim, dim, dim, sizeof(float));
+	starpu_monitor_blas_data(&C_state, 0, (uintptr_t)C, 
+		dim, dim, dim, sizeof(float));
+
+	gettimeofday(&start, NULL);
+	strassen(A_state, B_state, C_state, terminate, NULL, reclevel);
+}
+
+/*
+ * Test driver: parse the options, start StarPU, launch the computation and
+ * block on `sem' until terminate() posts it, then shut StarPU down.
+ */
+int main(__attribute__ ((unused)) int argc, 
+	 __attribute__ ((unused)) char **argv)
+{
+
+	parse_args(argc, argv);
+
+	/* start the runtime */
+	starpu_init();
+
+	/* the semaphore starts at 0, so sem_wait() blocks until it is posted */
+	sem_init(&sem, 0, 0U);
+
+	init_problem();
+	sem_wait(&sem);
+	sem_destroy(&sem);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 37 - 0
examples/strassen2/Makefile.in

@@ -0,0 +1,37 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Directory searched by pkg-config for the libstarpu description; the
+# @STARPUDIR@ placeholder is presumably substituted when this Makefile.in
+# is turned into a Makefile.
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+# "$$" reaches the shell as "$", so pkg-config is run by the shell whenever
+# a recipe uses these variables.
+LIBS+=$$(pkg-config --libs libstarpu)
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+# The shared BLAS helper object is handed to the link step through LDFLAGS.
+LDFLAGS += ../common/blas.o
+
+all: strassen
+
+# NOTE(review): this rule expects strassen.c, while the source listed for
+# this directory in the commit is strassen2.c -- confirm the file name.
+strassen.o: strassen.c
+	$(CC) $(CFLAGS) strassen.c -c -o strassen.o
+
+strassen_kernels.o: strassen_kernels.c
+	$(CC) $(CFLAGS) strassen_kernels.c -c -o strassen_kernels.o
+
+# Final link: ../common/blas.o is a prerequisite and is linked via LDFLAGS.
+strassen: strassen.o strassen_kernels.o ../common/blas.o
+	$(CC) strassen.o strassen_kernels.o -o strassen $(LDFLAGS) $(LIBS)
+
+# Remove objects, dependency files, coverage data and the binary.
+clean:
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f strassen

+ 833 - 0
examples/strassen2/strassen2.c

@@ -0,0 +1,833 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <starpu.h>
+
+
+/* capacity of the tag array stored in a struct data_deps_t */
+#define MAXDEPS	4
+
+/* next tag given to a newly created task; every task creation in this file
+ * takes the current value and increments it. main() additionally uses the
+ * fixed tags 42 and 10 for its start and end tasks. */
+uint64_t current_tag = 1024;
+
+/* number of bytes of matrix buffers allocated so far by this example */
+uint64_t used_mem = 0;
+
+/*
+
+Strassen:
+        M1 = (A11 + A22)(B11 + B22)
+        M2 = (A21 + A22)B11
+        M3 = A11(B12 - B22)
+        M4 = A22(B21 - B11)
+        M5 = (A11 + A12)B22
+        M6 = (A21 - A11)(B11 + B12)
+        M7 = (A12 - A22)(B21 + B22)
+
+        C11 = M1 + M4 - M5 + M7
+        C12 = M3 + M5
+        C21 = M2 + M4
+        C22 = M1 - M2 + M3 + M6
+
+	7 recursive calls to the Strassen algorithm (in each Mi computation)
+	10+7 temporary buffers (to compute the terms of Mi = Mia x Mib, and to store Mi)
+
+	complexity:
+		M(n) multiplication complexity
+		A(n) add/sub complexity
+
+		M(n) = (10 + 8) A(n/2) + 7 M(n/2)
+
+	NB: we consider fortran ordering (hence we compute M3t = (B12t - B22t)A11t for instance)
+
+ */
+
+/* command-line options, see parse_args() */
+/* -size: dimension of the square matrices */
+static unsigned size = 2048;
+/* -rec: recursion depth given to the root strassen_iter */
+static unsigned reclevel = 3;
+/* -no-random: set to 1 by parse_args() */
+static unsigned norandom = 0;
+/* -pin: when CUDA support is compiled in, allocate matrices with
+ * starpu_malloc_pinned_if_possible() instead of posix_memalign() */
+static unsigned pin = 0;
+
+/* CPU kernels referenced by the codelets declared in this file; they are
+ * defined in another translation unit */
+extern void mult_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+extern void sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+extern void add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+extern void self_add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+extern void self_sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+
+#ifdef USE_CUDA
+/* CUBLAS counterparts of the kernels above, only declared with CUDA support */
+extern void mult_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+extern void sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+extern void add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+extern void self_add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+extern void self_sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
+#endif
+
+/* kernel used by the cleanup codelet */
+extern void null_codelet(__attribute__((unused)) starpu_data_interface_t *descr,
+                  __attribute__((unused))  void *arg);
+
+
+/* called by main() with the elapsed time in microseconds and the matrix size */
+extern void display_perf(double timing, unsigned size);
+
+/*
+ * Performance models, one per kind of kernel. Each one is of type
+ * HISTORY_BASED, is identified by its symbol string and is attached to the
+ * codelet of the same name through the .model field.
+ */
+
+/* model attached to cl_mult */
+struct starpu_perfmodel_t strassen_model_mult = {
+        .type = HISTORY_BASED,
+        .symbol = "strassen_model_mult"
+};
+
+/* model attached to cl_add */
+struct starpu_perfmodel_t strassen_model_add = {
+        .type = HISTORY_BASED,
+        .symbol = "strassen_model_add"
+};
+
+/* model attached to cl_sub */
+struct starpu_perfmodel_t strassen_model_sub = {
+        .type = HISTORY_BASED,
+        .symbol = "strassen_model_sub"
+};
+
+
+/* model attached to cl_self_add */
+struct starpu_perfmodel_t strassen_model_self_add = {
+        .type = HISTORY_BASED,
+        .symbol = "strassen_model_self_add"
+};
+
+/* model attached to cl_self_sub */
+struct starpu_perfmodel_t strassen_model_self_sub = {
+        .type = HISTORY_BASED,
+        .symbol = "strassen_model_self_sub"
+};
+
+
+
+/* a set of at most MAXDEPS tags; only the first ndeps entries are meaningful */
+struct data_deps_t {
+	unsigned ndeps;
+	starpu_tag_t deps[MAXDEPS];
+};
+
+/* one node of the recursion: the product C = A * B at a given depth */
+struct strassen_iter {
+	/* remaining recursion depth; 0 means a single multiplication task */
+	unsigned reclevel;
+	/* the 7 sub-products M1..M7, filled by strassen_mult() when reclevel > 0 */
+	struct strassen_iter *children[7];
+
+	/* operands (A, B) and result (C) of this product */
+	starpu_data_handle A, B, C;
+
+	/* temporary buffers */
+	/* Mi = Mia * Mib*/
+	starpu_data_handle Mia_data[7];
+	starpu_data_handle Mib_data[7];
+	starpu_data_handle Mi_data[7];
+
+	/* input deps */
+	/* tags that must be done before A (resp. B) may be read */
+	struct data_deps_t A_deps;
+	struct data_deps_t B_deps;
+
+	/* output deps */
+	/* tags that are done once C is fully computed; set by strassen_mult() */
+	struct data_deps_t C_deps;
+};
+
+
+/*
+ * Filters always applied as a pair through starpu_map_filters(): f uses the
+ * block filter and f2 the vertical block filter, both with an argument of 2.
+ * NOTE(review): presumably this cuts a matrix in 2 along each dimension,
+ * giving the 4 quadrants fetched with get_sub_data() -- confirm.
+ */
+static starpu_filter f = 
+{
+	.filter_func = starpu_block_filter_func,
+	.filter_arg = 2
+};
+
+static starpu_filter f2 =
+{
+	.filter_func = starpu_vertical_block_filter_func,
+	.filter_arg = 2
+};
+
+/*
+ * Allocate a zero-filled size x size matrix of floats, register it with
+ * StarPU and apply the (f, f2) pair of filters reclevel times.
+ *
+ * The buffer is aligned on 4096 bytes, or obtained from
+ * starpu_malloc_pinned_if_possible() when CUDA support is compiled in and the
+ * -pin option was given. The number of allocated bytes is added to used_mem.
+ *
+ * Returns the handle of the registered matrix. A failed allocation stops the
+ * program through assert().
+ */
+starpu_data_handle allocate_tmp_matrix(unsigned size, unsigned reclevel)
+{
+	/* the handle is returned by value, it does not need heap storage */
+	starpu_data_handle data;
+	float *buffer = NULL;
+
+	/* computed in size_t so that size*size cannot wrap around in unsigned */
+	size_t nbytes = (size_t)size*size*sizeof(float);
+
+#ifdef USE_CUDA
+        if (pin) {
+                starpu_malloc_pinned_if_possible(&buffer, nbytes);
+        } else
+#endif
+        {
+		/* the content of buffer is unspecified when posix_memalign fails */
+		if (posix_memalign((void **)&buffer, 4096, nbytes) != 0)
+			buffer = NULL;
+        }
+
+	assert(buffer);
+
+	used_mem += nbytes;
+
+	memset(buffer, 0, nbytes);
+
+
+	starpu_monitor_blas_data(&data, 0, (uintptr_t)buffer, size, size, size, sizeof(float));
+
+	/* we construct a starpu_filter tree of depth reclevel */
+	unsigned rec;
+	for (rec = 0; rec < reclevel; rec++)
+		starpu_map_filters(data, 2, &f, &f2);
+
+	return data;
+}
+
+/* operation selector given to compute_add_sub_op() and
+ * compute_self_add_sub_op(); the latter only accepts ADD and SUB */
+enum operation {
+	ADD,
+	SUB,
+	MULT
+};
+
+/*
+ * Codelets of the three-buffer operations C = A op B. Each one names a CPU
+ * kernel, a CUBLAS kernel when CUDA support is compiled in, and the
+ * performance model of the operation.
+ * NOTE(review): .where = ANY presumably lets any kind of worker run the
+ * task -- confirm.
+ */
+
+/* C = A + B */
+static starpu_codelet cl_add = {
+	.where = ANY,
+	.model = &strassen_model_add,
+	.core_func = add_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = add_cublas_codelet,
+#endif
+	.nbuffers = 3
+};
+
+/* C = A - B */
+static starpu_codelet cl_sub = {
+	.where = ANY,
+	.model = &strassen_model_sub,
+	.core_func = sub_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = sub_cublas_codelet,
+#endif
+	.nbuffers = 3
+};
+
+/* C = A * B */
+static starpu_codelet cl_mult = {
+	.where = ANY,
+	.model = &strassen_model_mult,
+	.core_func = mult_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = mult_cublas_codelet,
+#endif
+	.nbuffers = 3
+};
+
+/*
+ * Create, without submitting it, the task computing C = A op B.
+ *
+ * op selects the codelet (ADD, SUB or MULT; anything else trips an assert).
+ * C is accessed in write mode, A and B in read mode. The task is given the
+ * next free tag, which the caller can read from the tag_id field in order to
+ * declare its dependencies before submitting the task.
+ */
+struct starpu_task *compute_add_sub_op(starpu_data_handle C, enum operation op, starpu_data_handle A, starpu_data_handle B)
+{
+	struct starpu_task *t = starpu_task_create();
+
+	/* identify the task by a fresh tag */
+	t->use_tag = 1;
+	t->tag_id = current_tag++;
+
+	/* output first, then the two operands */
+	t->buffers[0].state = C;
+	t->buffers[0].mode = W;
+	t->buffers[1].state = A;
+	t->buffers[1].mode = R;
+	t->buffers[2].state = B;
+	t->buffers[2].mode = R;
+
+	t->callback_func = NULL;
+
+	/* select the codelet implementing the requested operation */
+	if (op == ADD)
+		t->cl = &cl_add;
+	else if (op == SUB)
+		t->cl = &cl_sub;
+	else if (op == MULT)
+		t->cl = &cl_mult;
+	else
+		assert(0);
+
+	return t;
+}
+
+/*
+ * Codelets of the two-buffer in-place operations, used by
+ * compute_self_add_sub_op().
+ */
+
+/* C += A */
+static starpu_codelet cl_self_add = {
+	.where = ANY,
+	.model = &strassen_model_self_add,
+	.core_func = self_add_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = self_add_cublas_codelet,
+#endif
+	.nbuffers = 2
+};
+
+/* C -= A */
+static starpu_codelet cl_self_sub = {
+	.where = ANY,
+	.model = &strassen_model_self_sub,
+	.core_func = self_sub_core_codelet,
+#ifdef USE_CUDA
+	.cublas_func = self_sub_cublas_codelet,
+#endif
+	.nbuffers = 2
+};
+
+/*
+ * Create, without submitting it, the task computing C = C op A.
+ *
+ * op must be ADD or SUB; any other value trips an assert. C is accessed in
+ * read-write mode and A in read mode. The task is given the next free tag,
+ * available to the caller in the tag_id field.
+ */
+struct starpu_task *compute_self_add_sub_op(starpu_data_handle C, enum operation op, starpu_data_handle A)
+{
+	struct starpu_task *t = starpu_task_create();
+
+	/* identify the task by a fresh tag */
+	t->use_tag = 1;
+	t->tag_id = current_tag++;
+
+	/* the accumulator first, then the operand */
+	t->buffers[0].state = C;
+	t->buffers[0].mode = RW;
+	t->buffers[1].state = A;
+	t->buffers[1].mode = R;
+
+	t->callback_func = NULL;
+
+	/* only in-place additions and subtractions exist */
+	if (op == ADD)
+		t->cl = &cl_self_add;
+	else if (op == SUB)
+		t->cl = &cl_self_sub;
+	else
+		assert(0);
+
+	return t;
+}
+
+/* argument of a cleanup task, see create_cleanup_task() and cleanup_callback() */
+struct cleanup_arg {
+	/* number of valid entries in tags: the tags the cleanup task waits for */
+	unsigned ndeps;
+	uint64_t tags[8];
+	/* number of valid entries in data: the handles given to
+	 * starpu_advise_if_data_is_important() by the callback */
+	unsigned ndata;
+	starpu_data_handle data[32];
+};
+
+/*
+ * Callback of the cleanup task: calls starpu_advise_if_data_is_important()
+ * with a value of 0 on every handle listed in the struct cleanup_arg given
+ * as _arg, then frees that structure.
+ */
+void cleanup_callback(void *_arg)
+{
+	struct cleanup_arg *cleanup = _arg;
+	unsigned idx = 0;
+
+	while (idx < cleanup->ndata) {
+		starpu_advise_if_data_is_important(cleanup->data[idx], 0);
+		idx++;
+	}
+
+	/* the structure was allocated by the code that created the task */
+	free(cleanup);
+}
+
+/* codelet of the cleanup tasks: it accesses no buffer and runs null_codelet;
+ * the work is done by the callback attached to the task */
+static starpu_codelet cleanup_codelet = {
+	.where = ANY,
+	.model = NULL,
+	.core_func = null_codelet,
+#ifdef USE_CUDA
+	.cublas_func = null_codelet,
+#endif
+	.nbuffers = 0
+};
+
+/*
+ * Submit a task that runs once all the tags listed in cleanup_arg->tags are
+ * done and whose callback, cleanup_callback(), tells StarPU that the data
+ * listed in cleanup_arg->data are not essential any more.
+ *
+ * cleanup_arg is handed over to the callback, which frees it.
+ */
+void create_cleanup_task(struct cleanup_arg *cleanup_arg)
+{
+	struct starpu_task *cleanup = starpu_task_create();
+
+	/* identify the task by a fresh tag */
+	cleanup->use_tag = 1;
+	cleanup->tag_id = current_tag++;
+
+	cleanup->cl = &cleanup_codelet;
+	cleanup->callback_arg = cleanup_arg;
+	cleanup->callback_func = cleanup_callback;
+
+	starpu_tag_declare_deps_array(cleanup->tag_id, cleanup_arg->ndeps, cleanup_arg->tags);
+
+	starpu_submit_task(cleanup);
+}
+
+/* strassen_spawn_child() recurses into strassen_mult() */
+void strassen_mult(struct strassen_iter *iter);
+
+/* Copy the count and the valid tags of src into dst. */
+static void strassen_copy_deps(struct data_deps_t *dst, const struct data_deps_t *src)
+{
+	dst->ndeps = src->ndeps;
+	memcpy(dst->deps, src->deps, src->ndeps*sizeof(src->deps[0]));
+}
+
+/* Allocate *dst and submit the task computing *dst = X op Y after the tags
+ * of deps; returns the set made of the tag of that task. */
+static struct data_deps_t strassen_submit_operand(starpu_data_handle *dst,
+			unsigned size, unsigned reclevel, enum operation op,
+			starpu_data_handle X, starpu_data_handle Y,
+			struct data_deps_t *deps)
+{
+	struct data_deps_t done;
+
+	*dst = allocate_tmp_matrix(size, reclevel);
+
+	struct starpu_task *task = compute_add_sub_op(*dst, op, X, Y);
+
+	/* the tag is read before the task is handed over to StarPU */
+	done.ndeps = 1;
+	done.deps[0] = task->tag_id;
+
+	starpu_tag_declare_deps_array(done.deps[0], deps->ndeps, deps->deps);
+	starpu_submit_task(task);
+
+	return done;
+}
+
+/* Create the product C = A * B at depth reclevel, with the tags its operands
+ * wait for, and submit its tasks; returns the new node. */
+static struct strassen_iter *strassen_spawn_child(unsigned reclevel,
+			starpu_data_handle A, const struct data_deps_t *A_deps,
+			starpu_data_handle B, const struct data_deps_t *B_deps,
+			starpu_data_handle C)
+{
+	struct strassen_iter *child = malloc(sizeof(*child));
+	assert(child);
+
+	child->reclevel = reclevel;
+	strassen_copy_deps(&child->A_deps, A_deps);
+	strassen_copy_deps(&child->B_deps, B_deps);
+	child->A = A;
+	child->B = B;
+	child->C = C;
+
+	strassen_mult(child);
+
+	return child;
+}
+
+/*
+ * Submit all the tasks computing iter->C = iter->A * iter->B.
+ *
+ * On entry iter->reclevel, iter->A, iter->B, iter->C, iter->A_deps and
+ * iter->B_deps must be set. On return iter->C_deps holds the tags that are
+ * done once C is complete.
+ *
+ * When reclevel is 0 a single multiplication task is submitted. Otherwise
+ * the 10 operand sums/differences of Strassen's algorithm are submitted,
+ * the 7 products M1..M7 are submitted recursively (iter->children), their
+ * results are accumulated into the 4 quadrants of C with in-place
+ * additions/subtractions, and a cleanup task is submitted for the 17
+ * temporary matrices.
+ */
+void strassen_mult(struct strassen_iter *iter)
+{
+	if (iter->reclevel == 0)
+	{
+		struct starpu_task *task_mult = 
+			compute_add_sub_op(iter->C, MULT, iter->A, iter->B);
+		uint64_t tag_mult = task_mult->tag_id;
+
+		/* the product waits for the tags of both operands */
+		uint64_t deps_array[2*MAXDEPS];
+		unsigned indexA, indexB;
+		for (indexA = 0; indexA < iter->A_deps.ndeps; indexA++)
+		{
+			deps_array[indexA] = iter->A_deps.deps[indexA];
+		}
+
+		for (indexB = 0; indexB < iter->B_deps.ndeps; indexB++)
+		{
+			deps_array[indexB+indexA] = iter->B_deps.deps[indexB];
+		}
+
+		starpu_tag_declare_deps_array(tag_mult, indexA+indexB, deps_array);
+
+		iter->C_deps.ndeps = 1;
+		iter->C_deps.deps[0] = tag_mult;
+
+		starpu_submit_task(task_mult);
+
+		return;
+	}
+
+        starpu_data_handle A11 = get_sub_data(iter->A, 2, 0, 0);
+        starpu_data_handle A12 = get_sub_data(iter->A, 2, 1, 0);
+        starpu_data_handle A21 = get_sub_data(iter->A, 2, 0, 1);
+        starpu_data_handle A22 = get_sub_data(iter->A, 2, 1, 1);
+
+        starpu_data_handle B11 = get_sub_data(iter->B, 2, 0, 0);
+        starpu_data_handle B12 = get_sub_data(iter->B, 2, 1, 0);
+        starpu_data_handle B21 = get_sub_data(iter->B, 2, 0, 1);
+        starpu_data_handle B22 = get_sub_data(iter->B, 2, 1, 1);
+
+        starpu_data_handle C11 = get_sub_data(iter->C, 2, 0, 0);
+        starpu_data_handle C12 = get_sub_data(iter->C, 2, 1, 0);
+        starpu_data_handle C21 = get_sub_data(iter->C, 2, 0, 1);
+        starpu_data_handle C22 = get_sub_data(iter->C, 2, 1, 1);
+
+	/* dimension of a quadrant */
+	unsigned half = starpu_get_blas_nx(A11);
+	unsigned rec = iter->reclevel;
+
+	/* M1a = (A11 + A22) */
+	struct data_deps_t dep_1a = strassen_submit_operand(&iter->Mia_data[0], half, rec, ADD, A11, A22, &iter->A_deps);
+
+	/* M1b = (B11 + B22) */
+	struct data_deps_t dep_1b = strassen_submit_operand(&iter->Mib_data[0], half, rec, ADD, B11, B22, &iter->B_deps);
+
+	/* M2a = (A21 + A22) */
+	struct data_deps_t dep_2a = strassen_submit_operand(&iter->Mia_data[1], half, rec, ADD, A21, A22, &iter->A_deps);
+
+	/* M3b = (B12 - B22) */
+	struct data_deps_t dep_3b = strassen_submit_operand(&iter->Mib_data[2], half, rec, SUB, B12, B22, &iter->B_deps);
+
+	/* M4b = (B21 - B11) */
+	struct data_deps_t dep_4b = strassen_submit_operand(&iter->Mib_data[3], half, rec, SUB, B21, B11, &iter->B_deps);
+
+	/* M5a = (A11 + A12) */
+	struct data_deps_t dep_5a = strassen_submit_operand(&iter->Mia_data[4], half, rec, ADD, A11, A12, &iter->A_deps);
+
+	/* M6a = (A21 - A11) */
+	struct data_deps_t dep_6a = strassen_submit_operand(&iter->Mia_data[5], half, rec, SUB, A21, A11, &iter->A_deps);
+
+	/* M6b = (B11 + B12): this term is a sum, not a difference */
+	struct data_deps_t dep_6b = strassen_submit_operand(&iter->Mib_data[5], half, rec, ADD, B11, B12, &iter->B_deps);
+
+	/* M7a = (A12 - A22) */
+	struct data_deps_t dep_7a = strassen_submit_operand(&iter->Mia_data[6], half, rec, SUB, A12, A22, &iter->A_deps);
+
+	/* M7b = (B21 + B22) */
+	struct data_deps_t dep_7b = strassen_submit_operand(&iter->Mib_data[6], half, rec, ADD, B21, B22, &iter->B_deps);
+
+	/* buffers receiving the 7 products */
+	unsigned i;
+	for (i = 0; i < 7; i++)
+		iter->Mi_data[i] = allocate_tmp_matrix(half, rec);
+
+	/* an operand that is a quadrant of A (resp. B) inherits the tags of A
+	 * (resp. B); a temporary operand waits for the task that computes it */
+
+	/* M1 = M1a * M1b */
+	iter->children[0] = strassen_spawn_child(rec - 1,
+			iter->Mia_data[0], &dep_1a, iter->Mib_data[0], &dep_1b, iter->Mi_data[0]);
+
+	/* M2 = M2a * B11 */
+	iter->children[1] = strassen_spawn_child(rec - 1,
+			iter->Mia_data[1], &dep_2a, B11, &iter->B_deps, iter->Mi_data[1]);
+
+	/* M3 = A11 * M3b */
+	iter->children[2] = strassen_spawn_child(rec - 1,
+			A11, &iter->A_deps, iter->Mib_data[2], &dep_3b, iter->Mi_data[2]);
+
+	/* M4 = A22 * M4b */
+	iter->children[3] = strassen_spawn_child(rec - 1,
+			A22, &iter->A_deps, iter->Mib_data[3], &dep_4b, iter->Mi_data[3]);
+
+	/* M5 = M5a * B22 */
+	iter->children[4] = strassen_spawn_child(rec - 1,
+			iter->Mia_data[4], &dep_5a, B22, &iter->B_deps, iter->Mi_data[4]);
+
+	/* M6 = M6a * M6b */
+	iter->children[5] = strassen_spawn_child(rec - 1,
+			iter->Mia_data[5], &dep_6a, iter->Mib_data[5], &dep_6b, iter->Mi_data[5]);
+
+	/* M7 = M7a * M7b */
+	iter->children[6] = strassen_spawn_child(rec - 1,
+			iter->Mia_data[6], &dep_7a, iter->Mib_data[6], &dep_7b, iter->Mi_data[6]);
+
+	/* tags that are done once each product is available */
+	uint64_t *tag_m1 = iter->children[0]->C_deps.deps;
+	uint64_t *tag_m2 = iter->children[1]->C_deps.deps;
+	uint64_t *tag_m3 = iter->children[2]->C_deps.deps;
+	uint64_t *tag_m4 = iter->children[3]->C_deps.deps;
+	uint64_t *tag_m5 = iter->children[4]->C_deps.deps;
+	uint64_t *tag_m6 = iter->children[5]->C_deps.deps;
+	uint64_t *tag_m7 = iter->children[6]->C_deps.deps;
+
+	/* C11 = M1 + M4 - M5 + M7 */
+	struct starpu_task *task_c11_a = compute_self_add_sub_op(C11, ADD, iter->Mi_data[0]);
+	struct starpu_task *task_c11_b = compute_self_add_sub_op(C11, ADD, iter->Mi_data[3]);
+	struct starpu_task *task_c11_c = compute_self_add_sub_op(C11, SUB, iter->Mi_data[4]);
+	struct starpu_task *task_c11_d = compute_self_add_sub_op(C11, ADD, iter->Mi_data[6]);
+
+	uint64_t tag_c11_a = task_c11_a->tag_id;
+	uint64_t tag_c11_b = task_c11_b->tag_id;
+	uint64_t tag_c11_c = task_c11_c->tag_id;
+	uint64_t tag_c11_d = task_c11_d->tag_id;
+
+	/* C12 = M3 + M5 */
+	struct starpu_task *task_c12_a = compute_self_add_sub_op(C12, ADD, iter->Mi_data[2]);
+	struct starpu_task *task_c12_b = compute_self_add_sub_op(C12, ADD, iter->Mi_data[4]);
+
+	uint64_t tag_c12_a = task_c12_a->tag_id;
+	uint64_t tag_c12_b = task_c12_b->tag_id;
+
+	/* C21 = M2 + M4 */
+	struct starpu_task *task_c21_a = compute_self_add_sub_op(C21, ADD, iter->Mi_data[1]);
+	struct starpu_task *task_c21_b = compute_self_add_sub_op(C21, ADD, iter->Mi_data[3]);
+
+	uint64_t tag_c21_a = task_c21_a->tag_id;
+	uint64_t tag_c21_b = task_c21_b->tag_id;
+
+	/* C22 = M1 - M2 + M3 + M6 */
+	struct starpu_task *task_c22_a = compute_self_add_sub_op(C22, ADD, iter->Mi_data[0]);
+	struct starpu_task *task_c22_b = compute_self_add_sub_op(C22, SUB, iter->Mi_data[1]);
+	/* the third term is M3, i.e. Mi_data[2], matching its tag_m3 dependency */
+	struct starpu_task *task_c22_c = compute_self_add_sub_op(C22, ADD, iter->Mi_data[2]);
+	struct starpu_task *task_c22_d = compute_self_add_sub_op(C22, ADD, iter->Mi_data[5]);
+
+	uint64_t tag_c22_a = task_c22_a->tag_id;
+	uint64_t tag_c22_b = task_c22_b->tag_id;
+	uint64_t tag_c22_c = task_c22_c->tag_id;
+	uint64_t tag_c22_d = task_c22_d->tag_id;
+
+	/* each accumulation waits for its product and for the previous
+	 * accumulation into the same quadrant */
+	if (iter->reclevel == 1)
+	{
+		/* the children are single multiplications: one tag per product */
+		starpu_tag_declare_deps(tag_c11_a, 1, tag_m1[0]);
+		starpu_tag_declare_deps(tag_c11_b, 2, tag_m4[0], tag_c11_a);
+		starpu_tag_declare_deps(tag_c11_c, 2, tag_m5[0], tag_c11_b);
+		starpu_tag_declare_deps(tag_c11_d, 2, tag_m7[0], tag_c11_c);
+	
+		starpu_tag_declare_deps(tag_c12_a, 1, tag_m3[0]);
+		starpu_tag_declare_deps(tag_c12_b, 2, tag_m5[0], tag_c12_a);
+
+		starpu_tag_declare_deps(tag_c21_a, 1, tag_m2[0]);
+		starpu_tag_declare_deps(tag_c21_b, 2, tag_m4[0], tag_c21_a);
+	
+		starpu_tag_declare_deps(tag_c22_a, 1, tag_m1[0]);
+		starpu_tag_declare_deps(tag_c22_b, 2, tag_m2[0], tag_c22_a);
+		starpu_tag_declare_deps(tag_c22_c, 2, tag_m3[0], tag_c22_b);
+		starpu_tag_declare_deps(tag_c22_d, 2, tag_m6[0], tag_c22_c);
+	}
+	else
+	{
+		/* the children are recursive: four tags per product */
+		starpu_tag_declare_deps(tag_c11_a, 4, tag_m1[0], tag_m1[1], tag_m1[2], tag_m1[3]);
+		starpu_tag_declare_deps(tag_c11_b, 5, tag_m4[0], tag_m4[1], tag_m4[2], tag_m4[3], tag_c11_a);
+		starpu_tag_declare_deps(tag_c11_c, 5, tag_m5[0], tag_m5[1], tag_m5[2], tag_m5[3], tag_c11_b);
+		starpu_tag_declare_deps(tag_c11_d, 5, tag_m7[0], tag_m7[1], tag_m7[2], tag_m7[3], tag_c11_c);
+
+		starpu_tag_declare_deps(tag_c12_a, 4, tag_m3[0], tag_m3[1], tag_m3[2], tag_m3[3]);
+		starpu_tag_declare_deps(tag_c12_b, 5, tag_m5[0], tag_m5[1], tag_m5[2], tag_m5[3], tag_c12_a);
+
+		starpu_tag_declare_deps(tag_c21_a, 4, tag_m2[0], tag_m2[1], tag_m2[2], tag_m2[3]);
+		starpu_tag_declare_deps(tag_c21_b, 5, tag_m4[0], tag_m4[1], tag_m4[2], tag_m4[3], tag_c21_a);
+
+		starpu_tag_declare_deps(tag_c22_a, 4, tag_m1[0], tag_m1[1], tag_m1[2], tag_m1[3]);
+		starpu_tag_declare_deps(tag_c22_b, 5, tag_m2[0], tag_m2[1], tag_m2[2], tag_m2[3], tag_c22_a);
+		starpu_tag_declare_deps(tag_c22_c, 5, tag_m3[0], tag_m3[1], tag_m3[2], tag_m3[3], tag_c22_b);
+		starpu_tag_declare_deps(tag_c22_d, 5, tag_m6[0], tag_m6[1], tag_m6[2], tag_m6[3], tag_c22_c);
+	}
+
+	starpu_submit_task(task_c11_a);
+	starpu_submit_task(task_c11_b);
+	starpu_submit_task(task_c11_c);
+	starpu_submit_task(task_c11_d);
+
+	starpu_submit_task(task_c12_a);
+	starpu_submit_task(task_c12_b);
+
+	starpu_submit_task(task_c21_a);
+	starpu_submit_task(task_c21_b);
+
+	starpu_submit_task(task_c22_a);
+	starpu_submit_task(task_c22_b);
+	starpu_submit_task(task_c22_c);
+	starpu_submit_task(task_c22_d);
+
+	/* C is complete once the last accumulation of each quadrant is done */
+	iter->C_deps.ndeps = 4;
+	iter->C_deps.deps[0] = tag_c11_d;
+	iter->C_deps.deps[1] = tag_c12_b;
+	iter->C_deps.deps[2] = tag_c21_b;
+	iter->C_deps.deps[3] = tag_c22_d;
+
+	/* the cleanup task waits for the same tags; its callback frees the
+	 * structure */
+	struct cleanup_arg *clean_struct = malloc(sizeof(struct cleanup_arg));
+	assert(clean_struct);
+
+	clean_struct->ndeps = 4;
+		clean_struct->tags[0] = tag_c11_d;
+		clean_struct->tags[1] = tag_c12_b;
+		clean_struct->tags[2] = tag_c21_b;
+		clean_struct->tags[3] = tag_c22_d;
+	clean_struct->ndata = 17;
+		clean_struct->data[0] = iter->Mia_data[0];
+		clean_struct->data[1] = iter->Mib_data[0];
+		clean_struct->data[2] = iter->Mia_data[1];
+		clean_struct->data[3] = iter->Mib_data[2];
+		clean_struct->data[4] = iter->Mib_data[3];
+		clean_struct->data[5] = iter->Mia_data[4];
+		clean_struct->data[6] = iter->Mia_data[5];
+		clean_struct->data[7] = iter->Mib_data[5];
+		clean_struct->data[8] = iter->Mia_data[6];
+		clean_struct->data[9] = iter->Mib_data[6];
+		clean_struct->data[10] = iter->Mi_data[0];
+		clean_struct->data[11] = iter->Mi_data[1];
+		clean_struct->data[12] = iter->Mi_data[2];
+		clean_struct->data[13] = iter->Mi_data[3];
+		clean_struct->data[14] = iter->Mi_data[4];
+		clean_struct->data[15] = iter->Mi_data[5];
+		clean_struct->data[16] = iter->Mi_data[6];
+		
+	create_cleanup_task(clean_struct);
+}
+
+/* kernel of the dummy tasks: does nothing */
+static void dummy_codelet_func(__attribute__((unused))starpu_data_interface_t *descr,
+				__attribute__((unused))  void *arg)
+{
+}
+
+/* codelet of the tasks built by dummy_task(): it accesses no buffer and its
+ * kernel is empty, so such a task only carries a tag */
+static starpu_codelet dummy_codelet = {
+	.where = ANY,
+	.model = NULL,
+	.core_func = dummy_codelet_func,
+	#ifdef USE_CUDA
+	.cublas_func = dummy_codelet_func,
+	#endif
+	.nbuffers = 0
+};
+
+/*
+ * Create, without submitting it, a task that does no work and is identified
+ * by the given tag.
+ */
+static struct starpu_task *dummy_task(uint64_t tag)
+{
+	struct starpu_task *t = starpu_task_create();
+
+	t->use_tag = 1;
+	t->tag_id = tag;
+
+	/* empty kernel, no argument and no callback */
+	t->cl = &dummy_codelet;
+	t->cl_arg = NULL;
+	t->callback_func = NULL;
+
+	return t;
+}
+
+/*
+ * Read the command-line options into the file-scope settings:
+ *   -size <n>    sets size
+ *   -rec <n>     sets reclevel
+ *   -no-random   sets norandom to 1
+ *   -pin         sets pin to 1
+ * Unknown arguments are ignored. An option that lacks its value is reported
+ * on stderr and ends the parsing, leaving the setting unchanged.
+ */
+void parse_args(int argc, char **argv)
+{
+        int i;
+        for (i = 1; i < argc; i++) {
+                if (strcmp(argv[i], "-size") == 0) {
+                        /* never read past the last argument */
+                        if (i + 1 >= argc) {
+                                fprintf(stderr, "missing value after -size\n");
+                                break;
+                        }
+                        size = strtol(argv[++i], NULL, 10);
+                }
+                else if (strcmp(argv[i], "-rec") == 0) {
+                        if (i + 1 >= argc) {
+                                fprintf(stderr, "missing value after -rec\n");
+                                break;
+                        }
+                        reclevel = strtol(argv[++i], NULL, 10);
+                }
+                else if (strcmp(argv[i], "-no-random") == 0) {
+                        norandom = 1;
+                }
+                else if (strcmp(argv[i], "-pin") == 0) {
+                        pin = 1;
+                }
+
+        }
+}
+
+/*
+ * Entry point of the example.
+ *
+ * Allocates three zero-filled size x size matrices, registers and partitions
+ * them, submits the whole task graph of C = A * B with strassen_mult(), then
+ * times its execution: every task of the graph waits, directly or not, for
+ * the start task (tag 42), and the end task (tag 10) waits for the tags of C
+ * and is submitted synchronously. The elapsed time, in microseconds, is given
+ * to display_perf().
+ */
+int main(int argc, char **argv)
+{
+	starpu_data_handle data_A, data_B, data_C;
+	float *A = NULL, *B = NULL, *C = NULL;
+
+	struct timeval start;
+	struct timeval end;
+
+	parse_args(argc, argv);
+
+	/* computed in size_t so that size*size cannot wrap around in unsigned */
+	size_t nbytes = (size_t)size*size*sizeof(float);
+
+	starpu_init();
+
+#ifdef USE_CUDA
+        if (pin) {
+                starpu_malloc_pinned_if_possible(&A, nbytes);
+                starpu_malloc_pinned_if_possible(&B, nbytes);
+                starpu_malloc_pinned_if_possible(&C, nbytes);
+        } else
+#endif
+        {
+                /* the pointer is unspecified when posix_memalign fails */
+                if (posix_memalign((void **)&A, 4096, nbytes) != 0)
+                        A = NULL;
+                if (posix_memalign((void **)&B, 4096, nbytes) != 0)
+                        B = NULL;
+                if (posix_memalign((void **)&C, 4096, nbytes) != 0)
+                        C = NULL;
+        }
+
+	assert(A);
+	assert(B);
+	assert(C);
+
+	used_mem += 3*nbytes;
+
+	memset(A, 0, nbytes);
+	memset(B, 0, nbytes);
+	memset(C, 0, nbytes);
+
+	starpu_monitor_blas_data(&data_A, 0, (uintptr_t)A, size, size, size, sizeof(float));
+	starpu_monitor_blas_data(&data_B, 0, (uintptr_t)B, size, size, size, sizeof(float));
+	starpu_monitor_blas_data(&data_C, 0, (uintptr_t)C, size, size, size, sizeof(float));
+
+	/* apply the pair of filters once per recursion level */
+	unsigned rec;
+	for (rec = 0; rec < reclevel; rec++)
+	{
+		starpu_map_filters(data_A, 2, &f, &f2);
+		starpu_map_filters(data_B, 2, &f, &f2);
+		starpu_map_filters(data_C, 2, &f, &f2);
+	}
+
+	/* root of the recursion: both operands wait for the start tag */
+	struct strassen_iter iter;
+		iter.reclevel = reclevel;
+		iter.A = data_A;
+		iter.B = data_B;
+		iter.C = data_C;
+		iter.A_deps.ndeps = 1;
+		iter.A_deps.deps[0] = 42;
+		iter.B_deps.ndeps = 1;
+		iter.B_deps.deps[0] = 42;
+
+	strassen_mult(&iter);
+
+	/* the end tag waits for the tags of C */
+	starpu_tag_declare_deps_array(10, iter.C_deps.ndeps, iter.C_deps.deps);
+
+	/* used_mem is a uint64_t: convert it to the type named by the format */
+	fprintf(stderr, "Using %lu MB of memory\n", (unsigned long)(used_mem/(1024*1024)));
+
+	struct starpu_task *task_start = dummy_task(42);
+
+	gettimeofday(&start, NULL);
+	starpu_submit_task(task_start);
+
+	struct starpu_task *task_end = dummy_task(10);
+	
+	task_end->synchronous = 1;
+	starpu_submit_task(task_end);
+
+	gettimeofday(&end, NULL);
+
+	starpu_shutdown();
+
+	/* elapsed microseconds, computed in double to avoid integer overflow */
+	double timing = (double)(end.tv_sec - start.tv_sec)*1000000.0 + (double)(end.tv_usec - start.tv_usec);
+
+	display_perf(timing, size);
+
+	return 0;
+}

+ 242 - 0
examples/strassen2/strassen2_kernels.c

@@ -0,0 +1,242 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+#include <semaphore.h>
+
+#include <starpu_config.h>
+#ifdef USE_CUDA
+#include <cublas.h>
+#endif
+
+#include "../common/blas.h"
+
+#include <starpu.h>
+
+/* floating point operations accounted by the kernels of this file, for the
+ * CUBLAS and the CPU implementations respectively; read by display_perf().
+ * NOTE(review): the kernels update these counters without any
+ * synchronization -- confirm whether several workers may run them at once */
+static double cublas_flop = 0.0;
+static double cpus_flop = 0.0;
+
+/*
+ * Print the result of a run.
+ *
+ * timing is the elapsed time in microseconds and size the dimension of the
+ * multiplied matrices. The elapsed time in milliseconds goes to stdout; the
+ * operation counts (2*size^3, and the operations accounted by the kernels,
+ * split between CPU and GPU) and the resulting rate go to stderr.
+ */
+void display_perf(double timing, unsigned size)
+{
+	const double giga = 1000000000.0;
+
+	double flop_n3 = (2.0*size*size*size);
+	double flop_done = cublas_flop + cpus_flop;
+
+	/* share of the accounted operations done by each kind of worker */
+	double cpu_share = (100.0*cpus_flop)/(cpus_flop + cublas_flop);
+	double gpu_share = (100.0*cublas_flop)/(cpus_flop + cublas_flop);
+
+	fprintf(stderr, "Computation took (ms):\n");
+	printf("%2.2f\n", timing/1000);
+	fprintf(stderr, "       GFlop : O(n3) -> %2.2f\n", flop_n3/giga);
+	fprintf(stderr, "       GFlop : real %2.2f\n", flop_done/giga);
+	fprintf(stderr, "	CPU : %2.2f (%2.2f%%)\n", cpus_flop/giga, cpu_share);
+	fprintf(stderr, "	GPU : %2.2f (%2.2f%%)\n", cublas_flop/giga, gpu_share);
+	/* operations per microsecond, divided by 1000 */
+	fprintf(stderr, "       GFlop/s : %2.2f\n", flop_done / timing/1000);
+}
+
+/*
+ * Common body of the multiplication kernels.
+ *
+ * buffers[0] is the result ("center"), buffers[1] and buffers[2] the two
+ * operands ("left" and "right"); the dimension n is taken from buffers[0].
+ * s selects the implementation: 0 for the CPU BLAS, 1 for CUBLAS (only with
+ * CUDA support); any other value trips STARPU_ASSERT. The 2*n^3 operations
+ * are added to the counter of the selected implementation.
+ */
+static void mult_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused))  void *arg)
+{
+	float *center 	= (float *)buffers[0].blas.ptr;
+	float *left 	= (float *)buffers[1].blas.ptr;
+	float *right 	= (float *)buffers[2].blas.ptr;
+
+	unsigned n = buffers[0].blas.nx;
+
+	/* leading dimension of each matrix, named after the matrix it belongs to */
+	unsigned ld_left = buffers[1].blas.ld;
+	unsigned ld_right = buffers[2].blas.ld;
+	unsigned ld_center = buffers[0].blas.ld;
+
+	double flop = 2.0*n*n*n;
+
+	switch (s) {
+		case 0:
+			cpus_flop += flop;
+			/* every operand is passed with its own leading dimension, as in
+			 * the CUBLAS call below */
+			SGEMM("N", "N", n, n, n, 1.0f, right, ld_right, left, ld_left, 0.0f, center, ld_center);
+			break;
+#ifdef USE_CUDA
+		case 1:
+			cublas_flop += flop;
+			cublasSgemm('n', 'n', n, n, n, 1.0f, right, ld_right, left, ld_left, 0.0f, center, ld_center);
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+/* CPU multiplication kernel: mult_common_codelet() with implementation 0 */
+void mult_core_codelet(starpu_data_interface_t *descr, void *_args)
+{
+	mult_common_codelet(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS multiplication kernel: mult_common_codelet() with implementation 1 */
+void mult_cublas_codelet(starpu_data_interface_t *descr, void *_args)
+{
+	mult_common_codelet(descr, 1, _args);
+}
+#endif
+
+/*
+ * Common body of the kernels computing C = A + alpha*B.
+ *
+ * buffers[0] is C, buffers[1] is A and buffers[2] is B; the dimension n is
+ * taken from buffers[0] and the three matrices are treated as n x n. alpha
+ * is 1.0f for an addition and -1.0f for a subtraction. s selects the
+ * implementation: 0 for the CPU, 1 for CUBLAS (only with CUDA support); any
+ * other value trips STARPU_ASSERT.
+ *
+ * C is overwritten: each line of A is first copied into C, so the result
+ * does not depend on what C contained before the call.
+ */
+static void add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused))  void *arg, float alpha)
+{
+	/* C = A op B */
+
+	float *C 	= (float *)buffers[0].blas.ptr;
+	float *A 	= (float *)buffers[1].blas.ptr;
+	float *B 	= (float *)buffers[2].blas.ptr;
+
+	unsigned n = buffers[0].blas.nx;
+
+	unsigned ldA = buffers[1].blas.ld;
+	unsigned ldB = buffers[2].blas.ld;
+	unsigned ldC = buffers[0].blas.ld;
+
+	double flop = 2.0*n*n;
+
+	// TODO check dim ...
+
+	unsigned line;
+
+	switch (s) {
+		case 0:
+			cpus_flop += flop;
+			for (line = 0; line < n; line++)
+			{
+				/* copy line A into C */
+				memcpy(&C[line*ldC], &A[line*ldA], n*sizeof(float));
+				/* add line B to C = A */
+				SAXPY(n, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
+			}
+			break;
+#ifdef USE_CUDA
+		case 1:
+			cublas_flop += flop;
+			for (line = 0; line < n; line++)
+			{
+				/* copy line A into C */
+				cublasScopy(n, &A[line*ldA], 1, &C[line*ldC], 1);
+				/* add line B to C = A */
+				cublasSaxpy(n, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
+			}
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+/* CPU kernel: add_sub_common_codelet() with implementation 0 and alpha = -1 */
+void sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	add_sub_common_codelet(descr, 0, arg, -1.0f);
+}
+
+/* CPU kernel: add_sub_common_codelet() with implementation 0 and alpha = 1 */
+void add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	add_sub_common_codelet(descr, 0, arg, 1.0f);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS kernel: add_sub_common_codelet() with implementation 1 and alpha = -1 */
+void sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	add_sub_common_codelet(descr, 1, arg, -1.0f);
+}
+
+/* CUBLAS kernel: add_sub_common_codelet() with implementation 1 and alpha = 1 */
+void add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	add_sub_common_codelet(descr, 1, arg, 1.0f);
+}
+#endif
+
+
+/*
+ * Common body of the in-place kernels computing C = C + alpha*A.
+ *
+ * buffers[0] is the accumulator C and buffers[1] the operand A; the dimension
+ * n is taken from buffers[0]. alpha is 1.0f for "+=" and -1.0f for "-=".
+ * s selects the implementation: 0 for the CPU BLAS, 1 for CUBLAS (only with
+ * CUDA support); any other value trips STARPU_ASSERT. The n*n operations are
+ * added to the counter of the selected implementation.
+ */
+static void self_add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __attribute__((unused))  void *arg, float alpha)
+{
+	float *acc = (float *)buffers[0].blas.ptr;
+	float *operand = (float *)buffers[1].blas.ptr;
+
+	unsigned ld_acc = buffers[0].blas.ld;
+	unsigned ld_operand = buffers[1].blas.ld;
+
+	unsigned n = buffers[0].blas.nx;
+
+	// TODO check dim ...
+
+	unsigned row = 0;
+
+	if (s == 0) {
+		cpus_flop += 1.0*n*n;
+		/* one SAXPY per line: n contiguous elements, lines ld apart */
+		while (row < n) {
+			SAXPY(n, alpha, &operand[row*ld_operand], 1, &acc[row*ld_acc], 1);
+			row++;
+		}
+		return;
+	}
+
+#ifdef USE_CUDA
+	if (s == 1) {
+		cublas_flop += 1.0*n*n;
+		while (row < n) {
+			cublasSaxpy(n, alpha, &operand[row*ld_operand], 1, &acc[row*ld_acc], 1);
+			row++;
+		}
+		return;
+	}
+#endif
+
+	/* unknown implementation */
+	STARPU_ASSERT(0);
+}
+
+
+
+
+/* CPU kernel: self_add_sub_common_codelet() with implementation 0 and alpha = 1 */
+void self_add_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	self_add_sub_common_codelet(descr, 0, arg, 1.0f);
+}
+
+/* CPU kernel: self_add_sub_common_codelet() with implementation 0 and alpha = -1 */
+void self_sub_core_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	self_add_sub_common_codelet(descr, 0, arg, -1.0f);
+}
+
+#ifdef USE_CUDA
+/* CUBLAS kernel: self_add_sub_common_codelet() with implementation 1 and alpha = 1 */
+void self_add_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	self_add_sub_common_codelet(descr, 1, arg, 1.0f);
+}
+
+/* CUBLAS kernel: self_add_sub_common_codelet() with implementation 1 and alpha = -1 */
+void self_sub_cublas_codelet(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	self_add_sub_common_codelet(descr, 1, arg, -1.0f);
+}
+#endif
+
+/* this codelet does nothing: both parameters are ignored and the body is
+ * empty */
+void null_codelet(__attribute__((unused)) starpu_data_interface_t *descr,
+		  __attribute__((unused))  void *arg)
+{
+}

+ 38 - 0
examples/tag_example/Makefile.in

@@ -0,0 +1,38 @@
+#
+# StarPU
+# Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+# Directory searched by pkg-config for libstarpu.pc (substituted by configure)
+export PKG_CONFIG_PATH=@STARPUDIR@
+
+# $$ leaves a literal $(...) in the variables, so pkg-config is run by the
+# shell each time a recipe using LIBS or CFLAGS is executed.
+LIBS+=$$(pkg-config --libs libstarpu)
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+# all and clean do not produce files of that name: keep them working even if
+# such a file happens to exist in the directory.
+.PHONY: all clean
+
+all: tag_example tag_example2
+
+tag_example.o: tag_example.c
+	$(CC) $(CFLAGS) tag_example.c -c -o tag_example.o
+
+tag_example2.o: tag_example2.c
+	$(CC) $(CFLAGS) tag_example2.c -c -o tag_example2.o
+
+tag_example: tag_example.o
+	$(CC) tag_example.o -o tag_example $(LIBS)
+
+tag_example2: tag_example2.o
+	$(CC) tag_example2.o -o tag_example2 $(LIBS)
+
+clean:
+	@rm -f *.o *.d *.gcno *.gcda
+	@rm -f tag_example tag_example2

+ 207 - 0
examples/tag_example/tag_example.c

@@ -0,0 +1,207 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <starpu.h>
+
+#define TAG(i, j, iter)	((uint64_t) ( ((uint64_t)(iter)<<48) |  ((uint64_t)(j)<<24) | (i)) )	/* packs iter (from bit 48), j (from bit 24) and i (low bits) into one 64-bit tag; i and j are not masked */
+
+sem_t sem;	/* posted by callback_core when the last iteration is over; main blocks on it */
+starpu_codelet cl;	/* codelet shared by every task, filled in by main */
+
+#define Ni	64	/* default value of ni */
+#define Nj	32	/* default value of nj */
+#define Nk	2	/* default value of nk */
+
+static unsigned ni = Ni, nj = Nj, nk = Nk;	/* grid is ni x nj tasks, nk iterations; overridable from the command line */
+static unsigned callback_cnt;	/* callbacks still expected for the grid in flight */
+static unsigned iter = 0;	/* incremented by callback_core each time a grid completes */
+
+/*
+ * Parse the command line and override the default problem size.
+ *
+ *   -iter <n>   value stored in nk
+ *   -i <n>      value stored in ni
+ *   -j <n>      value stored in nj
+ *   -h          print the usage string
+ *
+ * Values are converted with strtol (base 10) and are not validated further.
+ * An option that needs a value but is the last word on the command line is
+ * reported and skipped, so argv is never read past argc. The value that
+ * follows an option is consumed and never re-interpreted as an option.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		const char *opt = argv[i];
+		unsigned *target = NULL;
+
+		if (strcmp(opt, "-h") == 0) {
+			printf("usage : %s [-iter iter] [-i i] [-j j]\n", argv[0]);
+			continue;
+		}
+
+		if (strcmp(opt, "-iter") == 0)
+			target = &nk;
+		else if (strcmp(opt, "-i") == 0)
+			target = &ni;
+		else if (strcmp(opt, "-j") == 0)
+			target = &nj;
+
+		/* unknown words are silently skipped */
+		if (!target)
+			continue;
+
+		if (i + 1 >= argc) {
+			/* nothing left to read: do not touch argv[argc] */
+			fprintf(stderr, "missing value for option %s\n", opt);
+			continue;
+		}
+
+		*target = strtol(argv[++i], NULL, 10);
+	}
+}
+
+void callback_core(void *argcb);
+static void express_deps(unsigned i, unsigned j, unsigned iter);
+
+/* Call starpu_tag_remove on the tag of every (i, j) cell of the ni x nj
+ * grid that belongs to iteration `iter`, column by column. */
+static void tag_cleanup_grid(unsigned ni, unsigned nj, unsigned iter)
+{
+	unsigned col = 0;
+
+	while (col < nj)
+	{
+		unsigned row;
+
+		for (row = 0; row < ni; row++)
+			starpu_tag_remove(TAG(row, col, iter));
+
+		col++;
+	}
+}
+
+/* Allocate a task that runs the shared codelet `cl`, reports its completion
+ * to callback_core and carries the tag `tag`. The task is not submitted. */
+static struct starpu_task *create_tagged_task(starpu_tag_t tag)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->callback_func = callback_core;
+	task->cl = &cl;
+	task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = tag;
+
+	return task;
+}
+
+/*
+ * Build and submit the ni x nj grid of tasks for iteration `iter`.
+ *
+ * callback_cnt is reset to ni*nj before anything is submitted; callback_core
+ * decrements it once per completed task. Tasks with i >= 1 are submitted
+ * first, each one after its dependencies have been declared by express_deps;
+ * the entry tasks (i == 0), which have no dependency, are submitted last.
+ */
+static void create_task_grid(unsigned iter)
+{
+	unsigned i, j;
+
+	/* iter is unsigned: print it with %u */
+	fprintf(stderr, "start iter %u...\n", iter);
+
+	callback_cnt = (ni*nj);
+
+	/* create non-entry tasks */
+	for (j = 0; j < nj; j++)
+	for (i = 1; i < ni; i++)
+	{
+		struct starpu_task *task = create_tagged_task(TAG(i, j, iter));
+
+		/* express deps : (i,j) depends on (i-1, j-1) & (i-1, j+1) */
+		express_deps(i, j, iter);
+
+		starpu_submit_task(task);
+	}
+
+	/* create entry tasks */
+	for (j = 0; j < nj; j++)
+	{
+		struct starpu_task *task = create_tagged_task(TAG(0, j, iter));
+
+		starpu_submit_task(task);
+	}
+}
+
+
+/*
+ * Completion callback shared by every task of the grid.
+ *
+ * Each call decrements callback_cnt. The call that brings it to zero
+ * increments iter, then either starts the next grid (iter < nk) or posts
+ * `sem`, which main is waiting on.
+ *
+ * NOTE(review): tags are only removed when iter > 2, and for iteration
+ * iter-2, so the tags of iteration 0 never seem to be removed here --
+ * confirm whether that is intended.
+ */
+void callback_core(void *argcb __attribute__ ((unused)))
+{
+	/* STARPU_ATOMIC_ADD yields the counter value after the addition */
+	unsigned remaining = STARPU_ATOMIC_ADD(&callback_cnt, -1);
+
+	if (remaining != 0)
+		return;
+
+	iter++;
+
+	if (!(iter < nk))
+	{
+		/* that was the last iteration */
+		sem_post(&sem);
+		return;
+	}
+
+	/* cleanup old grids ... */
+	if (iter > 2)
+		tag_cleanup_grid(ni, nj, iter-2);
+
+	/* create a new iteration */
+	create_task_grid(iter);
+}
+
+/* Kernel executed by every task: it ignores its argument and does nothing. */
+void core_codelet(void *_args __attribute__ ((unused)))
+{
+	return;
+}
+
+/*
+ * Declare the dependencies of task (i, j) of iteration `iter` on the tasks
+ * of line i-1 located in columns j-1 and j+1, keeping only the columns that
+ * lie inside [0, nj).
+ *
+ * When neither neighbouring column exists (j == 0 and nj - 1 == 0) the
+ * function hits STARPU_ASSERT(0).
+ */
+static void express_deps(unsigned i, unsigned j, unsigned iter)
+{
+	int has_prev_col = (j > 0);
+	int has_next_col = (j < nj - 1);
+
+	if (has_prev_col && has_next_col)
+	{
+		/* both (i-1,j-1) and (i-1,j+1) exist */
+		starpu_tag_declare_deps(TAG(i,j,iter), 2, TAG(i-1,j-1,iter), TAG(i-1,j+1,iter));
+	}
+	else if (has_prev_col)
+	{
+		/* last column: only (i-1,j-1) exists */
+		starpu_tag_declare_deps(TAG(i,j,iter), 1, TAG(i-1,j-1,iter));
+	}
+	else if (has_next_col)
+	{
+		/* first column: only (i-1,j+1) exists */
+		starpu_tag_declare_deps(TAG(i,j,iter), 1, TAG(i-1,j+1,iter));
+	}
+	else
+	{
+		/* no neighbouring column at all */
+		STARPU_ASSERT(0);
+	}
+}
+
+/*
+ * Entry point: initialise StarPU, read the command line, describe the
+ * codelet, submit the first grid and wait on `sem` until callback_core
+ * reports that the last iteration is over. Always returns 0.
+ */
+int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
+{
+	starpu_init();
+
+	parse_args(argc, argv);
+
+	/* the same empty kernel is registered for both implementations,
+	 * `where` is set to CORE */
+	cl.nbuffers = 0;
+	cl.cublas_func = core_codelet;
+	cl.core_func = core_codelet;
+	cl.where = CORE;
+
+	/* the semaphore starts at 0 so that sem_wait blocks */
+	sem_init(&sem, 0, 0);
+
+	/* iteration 0; the following ones are chained from callback_core */
+	create_task_grid(0);
+
+	sem_wait(&sem);
+
+	starpu_shutdown();
+
+	fprintf(stderr, "TEST DONE ...\n");
+
+	return 0;
+}

+ 149 - 0
examples/tag_example/tag_example2.c

@@ -0,0 +1,149 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include <starpu.h>
+
+/* Packs iter (from bit 32) and i (low bits) into one 64-bit tag. Both
+ * arguments are parenthesized so that any expression can be passed; i is
+ * not masked. */
+#define TAG(i, iter)	((uint64_t)  (((uint64_t)(iter))<<32 | (i)) )
+
+sem_t sem;	/* posted by callback_core when the last iteration is over; main blocks on it */
+starpu_codelet cl;	/* codelet shared by every task, filled in by main */
+
+#define Ni	64	/* default value of ni */
+#define Nk	2	/* default value of nk */
+
+static unsigned ni = Ni, nk = Nk;	/* ni tasks per iteration, nk iterations; overridable from the command line */
+static unsigned callback_cnt;	/* callbacks still expected for the chain in flight */
+static unsigned iter = 0;	/* incremented by callback_core each time a chain completes */
+
+/*
+ * Parse the command line and override the default problem size.
+ *
+ *   -iter <n>   value stored in nk
+ *   -i <n>      value stored in ni
+ *   -h          print the usage string
+ *
+ * Values are converted with strtol (base 10) and are not validated further.
+ * An option that needs a value but is the last word on the command line is
+ * reported and skipped, so argv is never read past argc. The value that
+ * follows an option is consumed and never re-interpreted as an option.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		const char *opt = argv[i];
+		unsigned *target = NULL;
+
+		if (strcmp(opt, "-h") == 0) {
+			printf("usage : %s [-iter iter] [-i i]\n", argv[0]);
+			continue;
+		}
+
+		if (strcmp(opt, "-iter") == 0)
+			target = &nk;
+		else if (strcmp(opt, "-i") == 0)
+			target = &ni;
+
+		/* unknown words are silently skipped */
+		if (!target)
+			continue;
+
+		if (i + 1 >= argc) {
+			/* nothing left to read: do not touch argv[argc] */
+			fprintf(stderr, "missing value for option %s\n", opt);
+			continue;
+		}
+
+		*target = strtol(argv[++i], NULL, 10);
+	}
+}
+
+void callback_core(void *argcb);
+
+/* Call starpu_tag_remove on the tag of each of the ni tasks that belong to
+ * iteration `iter`. */
+static void tag_cleanup_grid(unsigned ni, unsigned iter)
+{
+	unsigned idx = 0;
+
+	while (idx < ni)
+	{
+		starpu_tag_remove(TAG(idx,iter));
+		idx++;
+	}
+}
+
+/*
+ * Build and submit the chain of ni tasks for iteration `iter`.
+ *
+ * callback_cnt is reset to ni before anything is submitted; callback_core
+ * decrements it once per completed task. Every task except the first one
+ * is declared to depend on its predecessor (i-1) before being submitted.
+ */
+static void create_task_grid(unsigned iter)
+{
+	unsigned i;
+
+	/* both values are unsigned: print them with %u */
+	fprintf(stderr, "start iter %u ni %u...\n", iter, ni);
+
+	callback_cnt = (ni);
+
+	for (i = 0; i < ni; i++)
+	{
+		/* create a new task */
+		struct starpu_task *task = starpu_task_create();
+		task->callback_func = callback_core;
+		task->cl = &cl;
+		task->cl_arg = NULL;
+
+		task->use_tag = 1;
+		task->tag_id = TAG(i, iter);
+
+		/* task i waits for task i-1 of the same iteration */
+		if (i != 0)
+			starpu_tag_declare_deps(TAG(i,iter), 1, TAG(i-1,iter));
+
+		starpu_submit_task(task);
+	}
+}
+
+
+/*
+ * Completion callback shared by every task of the chain.
+ *
+ * Each call decrements callback_cnt. The call that brings it to zero
+ * increments iter, then either starts the next chain (iter < nk) or posts
+ * `sem`, which main is waiting on.
+ *
+ * NOTE(review): tags are only removed when iter > 2, and for iteration
+ * iter-2, so the tags of iteration 0 never seem to be removed here --
+ * confirm whether that is intended.
+ */
+void callback_core(void *argcb __attribute__ ((unused)))
+{
+	/* STARPU_ATOMIC_ADD yields the counter value after the addition */
+	unsigned remaining = STARPU_ATOMIC_ADD(&callback_cnt, -1);
+
+	if (remaining != 0)
+		return;
+
+	iter++;
+
+	if (!(iter < nk))
+	{
+		/* that was the last iteration */
+		sem_post(&sem);
+		return;
+	}
+
+	/* cleanup old grids ... */
+	if (iter > 2)
+		tag_cleanup_grid(ni, iter-2);
+
+	/* create a new iteration */
+	create_task_grid(iter);
+}
+
+/* Kernel executed by every task: it ignores its argument and does nothing. */
+void core_codelet(void *_args __attribute__ ((unused)))
+{
+	return;
+}
+
+/*
+ * Entry point: initialise StarPU, read the command line, describe the
+ * codelet, submit the first chain and wait on `sem` until callback_core
+ * reports that the last iteration is over. Always returns 0.
+ */
+int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
+{
+	starpu_init();
+
+	parse_args(argc, argv);
+
+	/* the same empty kernel is registered for both implementations,
+	 * `where` is set to CORE */
+	cl.nbuffers = 0;
+	cl.where = CORE;
+	cl.cublas_func = core_codelet;
+	cl.core_func = core_codelet;
+
+	/* the semaphore starts at 0 so that sem_wait blocks */
+	sem_init(&sem, 0, 0);
+
+	/* iteration 0; the following ones are chained from callback_core */
+	create_task_grid(0);
+
+	sem_wait(&sem);
+
+	starpu_shutdown();
+
+	fprintf(stderr, "TEST DONE ...\n");
+
+	return 0;
+}

+ 51 - 0
include/starpu-data-filters.h

@@ -0,0 +1,51 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_DATA_FILTERS_H__
+#define __STARPU_DATA_FILTERS_H__
+
+struct starpu_data_state_t;
+
+typedef struct starpu_filter_t {
+	unsigned (*filter_func)(struct starpu_filter_t *, struct starpu_data_state_t *); /* the actual partitioning function; it receives this filter and the data to partition */
+	uint32_t filter_arg; /* integer argument carried by the filter -- NOTE(review): its meaning presumably depends on filter_func, confirm per filter */
+	void *filter_arg_ptr; /* pointer argument carried by the filter -- NOTE(review): same remark as filter_arg */
+} starpu_filter;
+
+void starpu_partition_data(struct starpu_data_state_t *initial_data, starpu_filter *f); 
+void starpu_unpartition_data(struct starpu_data_state_t *root_data, uint32_t gathering_node);
+
+/* unsigned list */
+struct starpu_data_state_t *get_sub_data(struct starpu_data_state_t *root_data, unsigned depth, ... );
+
+/* starpu_filter * list */
+void starpu_map_filters(struct starpu_data_state_t *root_data, unsigned nfilters, ...);
+
+/* a few examples of filters */
+
+/* for BCSR */
+unsigned starpu_canonical_block_filter_bcsr(starpu_filter *f, struct starpu_data_state_t *root_data);
+unsigned starpu_vertical_block_filter_func_csr(starpu_filter *f, struct starpu_data_state_t *root_data);
+/* (filters for BLAS interface) */
+unsigned starpu_block_filter_func(starpu_filter *f, struct starpu_data_state_t *root_data);
+unsigned starpu_vertical_block_filter_func(starpu_filter *f, struct starpu_data_state_t *root_data);
+
+/* for vector */
+unsigned starpu_block_filter_func_vector(starpu_filter *f, struct starpu_data_state_t *root_data);
+unsigned starpu_list_filter_func_vector(starpu_filter *f, struct starpu_data_state_t *root_data);
+unsigned starpu_divide_in_2_filter_func_vector(starpu_filter *f, struct starpu_data_state_t *root_data);
+
+#endif

+ 133 - 0
include/starpu-data-interfaces.h

@@ -0,0 +1,133 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_DATA_INTERFACES_H__
+#define __STARPU_DATA_INTERFACES_H__
+
+struct starpu_data_state_t;
+typedef struct starpu_data_state_t * starpu_data_handle;
+
+/* BLAS interface for dense matrices */
+typedef struct starpu_blas_interface_s {
+	uintptr_t ptr; /* address of the matrix data, stored as an integer */
+	uint32_t nx; /* first dimension -- NOTE(review): which of nx/ny is the row length is not visible here, confirm */
+	uint32_t ny; /* second dimension */
+	uint32_t ld; /* leading dimension -- presumably counted in elements rather than bytes; TODO confirm */
+	size_t elemsize; /* presumably the size of one element in bytes; TODO confirm */
+} starpu_blas_interface_t;
+
+void starpu_monitor_blas_data(starpu_data_handle *handle, uint32_t home_node,
+                        uintptr_t ptr, uint32_t ld, uint32_t nx,
+                        uint32_t ny, size_t elemsize);
+uint32_t starpu_get_blas_nx(starpu_data_handle handle);
+uint32_t starpu_get_blas_ny(starpu_data_handle handle);
+uint32_t starpu_get_blas_local_ld(starpu_data_handle handle);
+uintptr_t starpu_get_blas_local_ptr(starpu_data_handle handle);
+
+/* vector interface for contiguous (non-strided) buffers */
+typedef struct starpu_vector_interface_s {
+	uintptr_t ptr; /* address of the vector data, stored as an integer */
+	uint32_t nx; /* presumably the number of elements; TODO confirm */
+	size_t elemsize; /* presumably the size of one element in bytes; TODO confirm */
+} starpu_vector_interface_t;
+
+void starpu_monitor_vector_data(starpu_data_handle *handle, uint32_t home_node,
+                        uintptr_t ptr, uint32_t nx, size_t elemsize);
+uint32_t starpu_get_vector_nx(starpu_data_handle handle);
+uintptr_t starpu_get_vector_local_ptr(starpu_data_handle handle);
+
+/* CSR interface for sparse matrices (compressed sparse row representation) */
+typedef struct starpu_csr_interface_s {
+	uint32_t nnz; /* number of non-zero entries */
+	uint32_t nrow; /* number of rows */
+	uintptr_t nzval; /* non-zero values */
+	uint32_t *colind; /* position of the non-zero entries on the row */
+	uint32_t *rowptr; /* index (in nzval) of the first entry of the row */
+
+        /* k for k-based indexing (0 or 1 usually) */
+        /* also useful when partitioning the matrix ... */
+        uint32_t firstentry;
+
+	size_t elemsize; /* presumably the size of one non-zero value in bytes; TODO confirm */
+} starpu_csr_interface_t;
+
+void starpu_monitor_csr_data(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
+		uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize);
+uint32_t starpu_get_csr_nnz(starpu_data_handle handle);
+uint32_t starpu_get_csr_nrow(starpu_data_handle handle);
+uint32_t starpu_get_csr_firstentry(starpu_data_handle handle);
+uintptr_t starpu_get_csr_local_nzval(starpu_data_handle handle);
+uint32_t *starpu_get_csr_local_colind(starpu_data_handle handle);
+uint32_t *starpu_get_csr_local_rowptr(starpu_data_handle handle);
+
+/* CSC interface for sparse matrices (compressed sparse column representation) */
+/* NOTE(review): the field names and comments below mirror the CSR structure
+ * (rows, colind, rowptr) although this is the column-oriented format --
+ * confirm whether they are meant literally. */
+typedef struct starpu_csc_interface_s {
+	int nnz; /* number of non-zero entries */
+	int nrow; /* number of rows */
+	float *nzval; /* non-zero values */
+	int *colind; /* position of the non-zero entries on the row */
+	int *rowptr; /* index (in nzval) of the first entry of the row */
+
+	/* k for k-based indexing (0 or 1 usually) */
+	/* also useful when partitioning the matrix ... */
+	int firstentry; 
+} starpu_csc_interface_t;
+
+/* BCSR interface for sparse matrices (blocked compressed sparse row
+ * representation) */
+typedef struct starpu_bcsr_interface_s {
+	uint32_t nnz; /* number of non-zero BLOCKS */
+	uint32_t nrow; /* number of rows (in terms of BLOCKS) */
+
+	uintptr_t nzval; /* non-zero values */
+	uint32_t *colind; /* position of the non-zero entries on the row */
+//	uint32_t *rowind; /* position of the non-zero entries on the col */
+	uint32_t *rowptr; /* index (in nzval) of the first entry of the row */
+
+        /* k for k-based indexing (0 or 1 usually) */
+        /* also useful when partitioning the matrix ... */
+        uint32_t firstentry;
+
+	/* size of the blocks */
+	uint32_t r;
+	uint32_t c;
+
+	size_t elemsize; /* presumably the size of one scalar value in bytes; TODO confirm */
+} starpu_bcsr_interface_t;
+
+void starpu_monitor_bcsr_data(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
+		uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize);
+
+
+uint32_t starpu_get_bcsr_nnz(starpu_data_handle);
+uint32_t starpu_get_bcsr_nrow(starpu_data_handle);
+uint32_t starpu_get_bcsr_firstentry(starpu_data_handle);
+uintptr_t starpu_get_bcsr_local_nzval(starpu_data_handle);
+uint32_t *starpu_get_bcsr_local_colind(starpu_data_handle);
+uint32_t *starpu_get_bcsr_local_rowptr(starpu_data_handle);
+uint32_t starpu_get_bcsr_r(starpu_data_handle);
+uint32_t starpu_get_bcsr_c(starpu_data_handle);
+
+/* One data descriptor, viewed through whichever interface matches the
+ * registered data; codelets receive an array of these. */
+typedef union {
+	starpu_blas_interface_t blas;	/* dense BLAS representation */
+	starpu_vector_interface_t vector; /* contiguous vector */
+	starpu_csr_interface_t csr;	/* compressed sparse row */
+	starpu_csc_interface_t csc; 	/* compressed sparse column */
+	starpu_bcsr_interface_t bcsr;	/* blocked compressed sparse row */
+} starpu_data_interface_t;
+
+
+#endif // __STARPU_DATA_INTERFACES_H__

+ 47 - 0
include/starpu-data.h

@@ -0,0 +1,47 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_DATA_H__
+#define __STARPU_DATA_H__
+
+#include <starpu-data-interfaces.h>
+#include <starpu-data-filters.h>
+
+#define NMAXBUFS        8
+
+struct starpu_data_state_t;
+
+typedef enum {
+	R, /* presumably read-only access; TODO confirm */
+	W, /* presumably write-only access; TODO confirm */
+	RW /* presumably read-write access; TODO confirm */
+} starpu_access_mode;
+
+typedef struct starpu_buffer_descr_t {
+	starpu_data_handle state; /* handle of the data this buffer refers to */
+	starpu_access_mode mode; /* access mode associated with that data */
+} starpu_buffer_descr;
+
+void starpu_unpartition_data(struct starpu_data_state_t *root_data, uint32_t gathering_node);
+void starpu_delete_data(struct starpu_data_state_t *state);
+
+void starpu_advise_if_data_is_important(struct starpu_data_state_t *state, unsigned is_important);
+
+void starpu_sync_data_with_mem(struct starpu_data_state_t *state);
+
+void starpu_malloc_pinned_if_possible(float **A, size_t dim);
+
+#endif // __STARPU_DATA_H__

+ 32 - 0
include/starpu-mutex.h

@@ -0,0 +1,32 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MUTEX_H__
+#define __STARPU_MUTEX_H__
+
+#include <stdint.h>
+
+typedef struct starpu_mutex_t {
+	/* only a trivial implementation exists so far: a single flag word,
+	 * aligned on 16 bytes */
+	volatile uint32_t taken __attribute__ ((aligned(16)));
+} starpu_mutex;
+
+void init_mutex(starpu_mutex *m);
+void take_mutex(starpu_mutex *m);
+int take_mutex_try(starpu_mutex *m);
+void release_mutex(starpu_mutex *m);
+
+#endif // __STARPU_MUTEX_H__

+ 94 - 0
include/starpu-perfmodel.h

@@ -0,0 +1,94 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_PERFMODEL_H__
+#define __STARPU_PERFMODEL_H__
+
+#include <stdio.h>
+#include <starpu-mutex.h>
+
+struct starpu_htbl32_node_s;
+struct starpu_history_list_t;
+struct starpu_buffer_descr_t;
+
+/* 
+   it is possible that we have multiple versions of the same kind of workers,
+   for instance multiple GPUs or even different CPUs within the same machine
+   so we do not use the archtype enum type directly for performance models
+*/
+
+/* on most systems we will consider one or two architectures, as all
+   accelerators are likely to be identical */
+#define NARCH_VARIATIONS	2
+
+enum starpu_perf_archtype {
+	STARPU_CORE_DEFAULT = 0,
+	STARPU_CUDA_DEFAULT = 1,
+	STARPU_GORDON_DEFAULT = 2 /* NOTE(review): 2 is not a valid index into an array of NARCH_VARIATIONS (2) entries such as per_arch[]; confirm this enum is never used to index it */
+};
+
+
+/* Running sums and fitted coefficients of the regression-based model */
+struct starpu_regression_model_t {
+	/* sum of ln(measured) */
+	double sumlny;
+
+	/* sum of ln(size) */
+	double sumlnx;
+	double sumlnx2; /* presumably the sum of ln(size)^2; TODO confirm */
+
+	/* sum of ln(size) ln(measured) */
+	double sumlnxlny;
+
+	/* y = alpha size ^ beta */
+	double alpha;
+	double beta;
+
+	/* y = a size ^b + c */
+	double a, b, c;
+	unsigned valid; /* NOTE(review): presumably tells whether a, b, c hold a usable fit; confirm */
+
+	unsigned nsample; /* presumably the number of measurements accumulated; TODO confirm */
+};
+
+/* Performance model data kept for one architecture variation */
+struct starpu_per_arch_perfmodel_t {
+	double (*cost_model)(struct starpu_buffer_descr_t *t); /* cost function for this architecture, given the task's buffer descriptors */
+	double alpha; /* NOTE(review): meaning not visible from this header; confirm */
+	struct starpu_htbl32_node_s *history; /* opaque here; presumably backs the history-based model */
+	struct starpu_history_list_t *list; /* opaque here; presumably the list of history entries */
+	struct starpu_regression_model_t regression;
+#ifdef MODEL_DEBUG
+	FILE *debug_file; /* only present when MODEL_DEBUG is defined */
+#endif
+};
+
+/* Performance model that a codelet can point to */
+struct starpu_perfmodel_t {
+	/* which model is used for that task ? */
+	enum {PER_ARCH, COMMON, HISTORY_BASED, REGRESSION_BASED} type;
+
+	/* single cost model */
+	double (*cost_model)(struct starpu_buffer_descr_t *);
+
+	/* per-architecture model */
+	struct starpu_per_arch_perfmodel_t per_arch[NARCH_VARIATIONS];
+	
+	const char *symbol; /* name of the model -- NOTE(review): presumably used to store and reload it; confirm */
+	unsigned is_loaded; /* NOTE(review): presumably set once the model has been loaded; confirm */
+	unsigned benchmarking; /* NOTE(review): meaning not visible from this header; confirm */
+
+	starpu_mutex model_mutex; /* presumably protects concurrent accesses to this model; TODO confirm */
+};
+
+#endif // __STARPU_PERFMODEL_H__

+ 144 - 0
include/starpu-task.h

@@ -0,0 +1,144 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_TASK_H__
+#define __STARPU_TASK_H__
+
+#include <starpu_config.h>
+
+/* this is an arbitrarily chosen value ... */
+#ifndef MAXCUDADEVS
+#define MAXCUDADEVS     4
+#endif
+
+#ifdef USE_CUDA
+#include <cuda.h>
+#endif
+
+#include <starpu-data.h>
+
+#define ANY	(~0)
+#define CORE	((1ULL)<<1)
+#define CUBLAS	((1ULL)<<2)
+#define CUDA	((1ULL)<<3)
+#define SPU	((1ULL)<<4)
+#define GORDON	((1ULL)<<5)
+
+#define MIN_PRIO        (-4)
+#define MAX_PRIO        5
+#define DEFAULT_PRIO	0
+
+typedef uint64_t starpu_tag_t;
+
+/*
+ * A codelet describes the various functions
+ * that may be called from a worker
+ */
+typedef struct starpu_codelet_t {
+	/* where can it be performed ? */
+	uint32_t where;
+
+	/* the different implementations of the codelet */
+	void *cuda_func;
+	void *cublas_func;
+	void *core_func;
+	void *spu_func;
+	uint8_t gordon_func;
+
+	/* how many buffers does the codelet take as arguments ? */
+	unsigned nbuffers;
+
+	struct starpu_perfmodel_t *model; /* performance model associated with the codelet */
+} starpu_codelet;
+
+/* A unit of work: a codelet plus the data and arguments it is applied to */
+struct starpu_task {
+	struct starpu_codelet_t *cl;
+
+	/* arguments managed by the DSM */
+	struct starpu_buffer_descr_t buffers[NMAXBUFS];
+	starpu_data_interface_t interface[NMAXBUFS];
+
+	/* arguments not managed by the DSM are given as a buffer */
+	void *cl_arg;
+	/* in case the argument buffer has to be uploaded explicitly */
+	size_t cl_arg_size;
+	
+	/* when the task is done, callback_func(callback_arg) is called */
+	void (*callback_func)(void *);
+	void *callback_arg;
+
+	unsigned use_tag; /* the examples set this to 1 together with tag_id */
+	starpu_tag_t tag_id;
+
+	/* options for the task execution */
+	unsigned synchronous; /* if set, a call to push is blocking */
+	int priority; /* MAX_PRIO = most important
+			 MIN_PRIO = least important */
+
+	/* this is private to StarPU, do not modify */
+	void *starpu_private;
+};
+
+#ifdef USE_CUDA
+/* CUDA specific codelets */
+typedef struct starpu_cuda_module_s {
+	CUmodule module;
+	char *module_path; /* presumably the path given to starpu_init_cuda_module; TODO confirm */
+	unsigned is_loaded[MAXCUDADEVS]; /* one entry per CUDA device -- presumably a "loaded on that device" flag; TODO confirm */
+} starpu_cuda_module_t;
+
+typedef struct starpu_cuda_function_s {
+	struct starpu_cuda_module_s *module; /* module the function is associated with */
+	CUfunction function;
+	char *symbol; /* presumably the name of the function inside the module; TODO confirm */
+	unsigned is_loaded[MAXCUDADEVS]; /* one entry per CUDA device -- presumably a "loaded on that device" flag; TODO confirm */
+} starpu_cuda_function_t;
+
+typedef struct starpu_cuda_codelet_s {
+	/* which function to execute on the card ? */
+	struct starpu_cuda_function_s *func;
+
+	/* grid and block shapes */
+	unsigned gridx;
+	unsigned gridy;
+	unsigned blockx;
+	unsigned blocky;
+
+	unsigned shmemsize; /* presumably the amount of shared memory requested for the launch; TODO confirm unit */
+
+	void *stack; /* arguments */
+	size_t stack_size; /* presumably the size of `stack` in bytes; TODO confirm */
+} starpu_cuda_codelet_t;
+
+void starpu_init_cuda_module(struct starpu_cuda_module_s *module, char *path);
+void starpu_load_cuda_module(int devid, struct starpu_cuda_module_s *module);
+void starpu_init_cuda_function(struct starpu_cuda_function_s *func,
+                        struct starpu_cuda_module_s *module,
+                        char *symbol);
+void starpu_load_cuda_function(int devid, struct starpu_cuda_function_s *function);
+#endif // USE_CUDA
+
+/* handle task dependencies: it is possible to associate a task with a unique
+ * "tag" and to express dependencies among tasks by the means of those tags */
+void starpu_tag_remove(starpu_tag_t id);
+void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);
+void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
+
+struct starpu_task *starpu_task_create(void);
+int starpu_submit_task(struct starpu_task *task);
+
+
+#endif // __STARPU_TASK_H__

+ 63 - 0
include/starpu-util.h

@@ -0,0 +1,63 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_UTIL_H__
+#define __STARPU_UTIL_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define STARPU_MIN(a,b)	((a)<(b)?(a):(b))	/* the selected argument is evaluated twice: avoid side effects */
+#define STARPU_MAX(a,b)	((a)<(b)?(b):(a))	/* the selected argument is evaluated twice: avoid side effects */
+
+#define STARPU_ASSERT(x)	assert(x)	/* plain assert(): compiled out when NDEBUG is defined */
+
+#define STARPU_UNLIKELY(expr)          (__builtin_expect(!!(expr),0))	/* branch hint: expr is expected to be false */
+#define STARPU_LIKELY(expr)            (__builtin_expect(!!(expr),1))	/* branch hint: expr is expected to be true */
+
+#define STARPU_ATOMIC_ADD(ptr, value)  (__sync_fetch_and_add ((ptr), (value)) + (value))	/* atomically adds value to *ptr and yields the updated value */
+#define STARPU_ATOMIC_OR(ptr, value)  (__sync_fetch_and_or ((ptr), (value)))	/* atomically ORs value into *ptr and yields the previous value */
+
+#define STARPU_SUCCESS	0
+#define STARPU_TRYAGAIN	1
+#define STARPU_FATAL	2
+
+/*
+ * Read the environment variable `str` as a base-10 integer.
+ *
+ * Returns the parsed value, or -1 when the variable is not set; a variable
+ * explicitly set to -1 therefore cannot be told apart from an unset one.
+ * STARPU_ASSERT fires when the number is followed by trailing characters.
+ * An empty value is accepted and yields 0. The result of strtol is narrowed
+ * to int without a range check.
+ */
+static int __attribute__ ((unused)) starpu_get_env_number(const char *str)
+{
+	char *strval;
+	char *check;
+	int val;
+
+	strval = getenv(str);
+	if (!strval) {
+		/* there is no such env variable */
+		return -1;
+	}
+
+	/* the env variable was actually set: keep the value in a signed
+	 * variable, since that is what the function returns */
+	val = (int)strtol(strval, &check, 10);
+
+	/* the whole string must have been consumed */
+	STARPU_ASSERT(*check == '\0');
+
+	return val;
+}
+
+#endif // __STARPU_UTIL_H__

+ 36 - 0
include/starpu.h

@@ -0,0 +1,36 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_H__
+#define __STARPU_H__
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <starpu_config.h>
+#include <starpu-util.h>
+#include <starpu-data.h>
+#include <starpu-perfmodel.h>
+#include <starpu-task.h>
+
+/* Initialization method: it must be called prior to any other StarPU call */
+void starpu_init(void);
+
+/* Shutdown method: note that statistics are only generated once StarPU is
+ * shutdown */
+void starpu_shutdown(void);
+
+#endif // __STARPU_H__

+ 0 - 0
include/starpu_config.h.in


Some files were not shown because too many files changed in this diff