
Get rid of the starpu-async branch where all recent changes have been made.

Cédric Augonnet, 16 years ago
commit f33fc377f9
100 files changed, 8474 insertions(+), 2442 deletions(-)
  1. ChangeLog (+63 -0)
  2. acinclude.m4 (+13 -0)
  3. build-aux/depcomp (+0 -589)
  4. build-aux/missing (+0 -367)
  5. configure.ac (+60 -45)
  6. doc/Gantt (+17 -0)
  7. doc/Makefile.am (+4 -0)
  8. doc/starpu.texi (+555 -25)
  9. examples/Makefile.am (+94 -36)
  10. examples/audio/Makefile (+20 -0)
  11. examples/audio/input.wav (binary)
  12. examples/audio/starpu-audio-processing.c (+441 -0)
  13. examples/axpy/axpy.c (+162 -0)
  14. examples/cholesky/dw_cholesky.c (+19 -12)
  15. examples/cholesky/dw_cholesky.h (+27 -3)
  16. examples/cholesky/dw_cholesky_grain.c (+377 -0)
  17. examples/cholesky/dw_cholesky_kernels.c (+23 -5)
  18. examples/cholesky/dw_cholesky_no_stride.c (+11 -6)
  19. examples/common/blas.c (+142 -3)
  20. examples/common/blas.h (+43 -0)
  21. examples/heat/dw_factolu.c (+30 -20)
  22. examples/heat/dw_factolu.h (+2 -0)
  23. examples/heat/dw_factolu_grain.c (+340 -0)
  24. examples/heat/dw_factolu_kernels.c (+117 -28)
  25. examples/heat/dw_factolu_tag.c (+20 -15)
  26. examples/heat/dw_sparse_cg.c (+13 -78)
  27. examples/heat/heat.c (+104 -99)
  28. examples/heat/heat.h (+4 -7)
  29. examples/heat/heat_display.c (+0 -45)
  30. examples/incrementer/incrementer.c (+21 -102)
  31. examples/incrementer/incrementer_kernels.cu (+32 -0)
  32. examples/incrementer/incrementer_runtime.c (+0 -114)
  33. examples/lu/dlu.c (+18 -0)
  34. examples/lu/dlu_kernels.c (+18 -0)
  35. examples/incrementer/incrementer_runtime_kernels.cu (+2 -12)
  36. examples/lu/double.h (+39 -0)
  37. examples/lu/float.h (+39 -0)
  38. examples/lu/lu_example.c (+311 -0)
  39. examples/lu/lu_example_double.c (+18 -0)
  40. examples/lu/lu_example_float.c (+18 -0)
  41. examples/lu/slu.c (+18 -0)
  42. examples/lu/slu_kernels.c (+18 -0)
  43. examples/lu/slu_pivot.c (+18 -0)
  44. examples/lu/xlu.c (+332 -0)
  45. examples/lu/xlu.h (+109 -0)
  46. examples/lu/xlu_kernels.c (+457 -0)
  47. examples/lu/xlu_kernels.h (+42 -0)
  48. examples/lu/xlu_pivot.c (+526 -0)
  49. examples/mult/dgemm.c (+26 -0)
  50. examples/mult/dw_mult.c (+29 -105)
  51. examples/mult/dw_mult.h (+36 -0)
  52. examples/mult/dw_mult_no_filters.c (+0 -256)
  53. examples/mult/dw_mult_no_stride.c (+17 -131)
  54. examples/mult/dw_mult_no_stride_no_tag.c (+17 -109)
  55. src/drivers/cuda/comp_cuda.h (+8 -12)
  56. examples/mult/sgemm_kernels.c (+69 -0)
  57. examples/mult/xgemm.c (+261 -0)
  58. examples/mult/xgemm_kernels.c (+71 -0)
  59. examples/pastix-wrappers/starpu-blas-wrapper.c (+10 -10)
  60. examples/ppm-downscaler/ppm-downscaler.c (+1 -1)
  61. examples/ppm-downscaler/yuv-downscaler.c (+1 -0)
  62. examples/spmv/dw_block_spmv.c (+2 -2)
  63. examples/spmv/dw_spmv.c (+18 -44)
  64. examples/cuda/spmv_cuda.cu (+12 -29)
  65. examples/starpufft/Makefile.am (+60 -0)
  66. examples/starpufft/cuda_kernels.cu (+18 -0)
  67. examples/starpufft/cudaf_kernels.cu (+18 -0)
  68. examples/starpufft/cudax_kernels.cu (+142 -0)
  69. examples/starpufft/cudax_kernels.h (+21 -0)
  70. examples/starpufft/double.h (+47 -0)
  71. examples/starpufft/float.h (+47 -0)
  72. examples/starpufft/starpufft-common.c (+19 -0)
  73. examples/starpufft/starpufft.c (+18 -0)
  74. examples/starpufft/starpufft.h (+54 -0)
  75. examples/starpufft/starpufftf.c (+18 -0)
  76. examples/starpufft/starpufftx.c (+378 -0)
  77. examples/starpufft/starpufftx1d.c (+640 -0)
  78. examples/starpufft/starpufftx2d.c (+708 -0)
  79. examples/starpufft/test.c (+18 -0)
  80. examples/starpufft/test_threads.c (+18 -0)
  81. examples/starpufft/testf.c (+18 -0)
  82. examples/starpufft/testf_threads.c (+18 -0)
  83. examples/starpufft/testx.c (+228 -0)
  84. examples/starpufft/testx_threads.c (+100 -0)
  85. examples/strassen/strassen.c (+10 -10)
  86. examples/strassen/strassen_kernels.c (+6 -0)
  87. examples/strassen/test_strassen.c (+4 -0)
  88. examples/strassen2/strassen2.c (+64 -51)
  89. examples/strassen2/strassen2_kernels.c (+24 -0)
  90. examples/tag_example/tag_example.c (+4 -3)
  91. examples/tag_example/tag_example2.c (+4 -3)
  92. examples/tag_example/tag_example3.c (+2 -2)
  93. examples/tag_example/tag_restartable.c (+156 -0)
  94. include/starpu-data-filters.h (+8 -0)
  95. include/starpu-data-interfaces.h (+11 -0)
  96. include/starpu-data.h (+22 -1)
  97. include/starpu-perfmodel.h (+12 -2)
  98. include/starpu-task.h (+90 -58)
  99. include/starpu-util.h (+154 -2)
  100. include/starpu.h (+0 -0)

ChangeLog (+63 -0)

@@ -0,0 +1,63 @@
+StarPU 0.2.901 aka 0.3-rc1 (svn revision 1236)
+==============================================
+The asynchronous heterogeneous multi-accelerator release
+
+  * Many API changes and code cleanups
+    - Implement starpu_get_worker_id
+    - Implement starpu_get_worker_name
+    - Implement starpu_get_worker_type
+    - Implement starpu_get_worker_count
+    - Implement starpu_display_codelet_stats
+    - Implement starpu_prefetch_data_on_node
+    - Expose the starpu_data_set_wb_mask function
+  * Support nvidia (heterogeneous) multi-GPU
+  * Add the data request mechanism
+    - All data transfers use data requests now
+    - Implement asynchronous data transfers
+    - Implement prefetch mechanism
+    - Chain data requests to support GPU->RAM->GPU transfers 
+  * Make it possible to bypass the scheduler and to assign a task to a specific
+    worker
+  * Support restartable tasks to reinstanciate dependencies task graphs
+  * Improve performance prediction
+    - Model data transfer overhead
+    - One model is created for each accelerator
+  * Support for CUDA's driver API is deprecated
+  * The WORKERS_GPUID and WORKERS_CPUID env. variables make it possible to
+    specify where to bind the workers
+  * Use the hwloc library to detect the actual number of cores
+
+StarPU 0.2.0 (svn revision 1013)
+==============================================
+The Stabilizing-the-Basics release
+
+  * Various API cleanups
+  * Mac OS X is supported now
+  * Add dynamic code loading facilities onto Cell's SPUs
+  * Improve performance analysis/feedback tools
+  * Application can interact with StarPU tasks
+    - The application may access/modify data managed by the DSM
+    - The application may wait for the termination of a (set of) task(s)
+  * An initial documentation is added
+  * More examples are supplied
+
+
+StarPU 0.1.0 (svn revision 794)
+==============================================
+First release.
+
+Status:
+ * Only supports Linux platforms yet
+ * Supported architectures
+   - multicore CPUs
+   - NVIDIA GPUs (with CUDA 2.x)
+   - experimental Cell/BE support
+
+Changes:
+ * Scheduling facilities
+   - run-time selection of the scheduling policy
+   - basic auto-tuning facilities
+ * Software-based DSM
+   - transparent data coherency management
+   - High-level expressive interface
+

acinclude.m4 (+13 -0)

@@ -12,3 +12,16 @@ AC_DEFUN([STARPU_CHECK_SYNC_BUILTINS], [
     AC_DEFINE(HAVE_SYNC_BUILTINS, 1,
 	      [Define to 1 if the target supports __sync_*_compare_and_swap])
   fi])
+
+dnl Check whether the target supports __sync_fetch_and_add.
+AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_ADD], [
+  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_add],
+		 ac_cv_have_sync_fetch_and_add, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __sync_fetch_and_add(&foo, 1);])],
+			[ac_cv_have_sync_fetch_and_add=yes],
+			[ac_cv_have_sync_fetch_and_add=no])])
+  if test $ac_cv_have_sync_fetch_and_add = yes; then
+    AC_DEFINE(HAVE_SYNC_FETCH_AND_ADD, 1,
+	      [Define to 1 if the target supports __sync_fetch_and_add])
+  fi])
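
For reference, the builtin probed by this new macro is typically used as in the sketch below. This is an illustrative fragment, not code from the patch, guarded by the HAVE_SYNC_FETCH_AND_ADD symbol the check defines.

    /* Minimal sketch of what the configure test exercises: an atomic
     * fetch-and-add on an integer via the GCC __sync builtin. */
    static unsigned counter = 0;

    unsigned fetch_and_increment(void)
    {
    #ifdef HAVE_SYNC_FETCH_AND_ADD
            /* Atomically adds 1 and returns the previous value. */
            return __sync_fetch_and_add(&counter, 1);
    #else
            /* Fallback (not thread-safe); real code would take a lock here. */
            return counter++;
    #endif
    }
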

build-aux/depcomp (+0 -589)

@@ -1,589 +0,0 @@
-#! /bin/sh
-# depcomp - compile a program generating dependencies as side-effects
-
-scriptversion=2007-03-29.01
-
-# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007 Free Software
-# Foundation, Inc.
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-# 02110-1301, USA.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# Originally written by Alexandre Oliva <oliva@dcc.unicamp.br>.
-
-case $1 in
-  '')
-     echo "$0: No command.  Try \`$0 --help' for more information." 1>&2
-     exit 1;
-     ;;
-  -h | --h*)
-    cat <<\EOF
-Usage: depcomp [--help] [--version] PROGRAM [ARGS]
-
-Run PROGRAMS ARGS to compile a file, generating dependencies
-as side-effects.
-
-Environment variables:
-  depmode     Dependency tracking mode.
-  source      Source file read by `PROGRAMS ARGS'.
-  object      Object file output by `PROGRAMS ARGS'.
-  DEPDIR      directory where to store dependencies.
-  depfile     Dependency file to output.
-  tmpdepfile  Temporary file to use when outputing dependencies.
-  libtool     Whether libtool is used (yes/no).
-
-Report bugs to <bug-automake@gnu.org>.
-EOF
-    exit $?
-    ;;
-  -v | --v*)
-    echo "depcomp $scriptversion"
-    exit $?
-    ;;
-esac
-
-if test -z "$depmode" || test -z "$source" || test -z "$object"; then
-  echo "depcomp: Variables source, object and depmode must be set" 1>&2
-  exit 1
-fi
-
-# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
-depfile=${depfile-`echo "$object" |
-  sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
-tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
-
-rm -f "$tmpdepfile"
-
-# Some modes work just like other modes, but use different flags.  We
-# parameterize here, but still list the modes in the big case below,
-# to make depend.m4 easier to write.  Note that we *cannot* use a case
-# here, because this file can only contain one case statement.
-if test "$depmode" = hp; then
-  # HP compiler uses -M and no extra arg.
-  gccflag=-M
-  depmode=gcc
-fi
-
-if test "$depmode" = dashXmstdout; then
-   # This is just like dashmstdout with a different argument.
-   dashmflag=-xM
-   depmode=dashmstdout
-fi
-
-case "$depmode" in
-gcc3)
-## gcc 3 implements dependency tracking that does exactly what
-## we want.  Yay!  Note: for some reason libtool 1.4 doesn't like
-## it if -MD -MP comes after the -MF stuff.  Hmm.
-## Unfortunately, FreeBSD c89 acceptance of flags depends upon
-## the command line argument order; so add the flags where they
-## appear in depend2.am.  Note that the slowdown incurred here
-## affects only configure: in makefiles, %FASTDEP% shortcuts this.
-  for arg
-  do
-    case $arg in
-    -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
-    *)  set fnord "$@" "$arg" ;;
-    esac
-    shift # fnord
-    shift # $arg
-  done
-  "$@"
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  mv "$tmpdepfile" "$depfile"
-  ;;
-
-gcc)
-## There are various ways to get dependency output from gcc.  Here's
-## why we pick this rather obscure method:
-## - Don't want to use -MD because we'd like the dependencies to end
-##   up in a subdir.  Having to rename by hand is ugly.
-##   (We might end up doing this anyway to support other compilers.)
-## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
-##   -MM, not -M (despite what the docs say).
-## - Using -M directly means running the compiler twice (even worse
-##   than renaming).
-  if test -z "$gccflag"; then
-    gccflag=-MD,
-  fi
-  "$@" -Wp,"$gccflag$tmpdepfile"
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
-## The second -e expression handles DOS-style file names with drive letters.
-  sed -e 's/^[^:]*: / /' \
-      -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
-## This next piece of magic avoids the `deleted header file' problem.
-## The problem is that when a header file which appears in a .P file
-## is deleted, the dependency causes make to die (because there is
-## typically no way to rebuild the header).  We avoid this by adding
-## dummy dependencies for each header file.  Too bad gcc doesn't do
-## this for us directly.
-  tr ' ' '
-' < "$tmpdepfile" |
-## Some versions of gcc put a space before the `:'.  On the theory
-## that the space means something, we add a space to the output as
-## well.
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-hp)
-  # This case exists only to let depend.m4 do its work.  It works by
-  # looking at the text of this script.  This case will never be run,
-  # since it is checked for above.
-  exit 1
-  ;;
-
-sgi)
-  if test "$libtool" = yes; then
-    "$@" "-Wp,-MDupdate,$tmpdepfile"
-  else
-    "$@" -MDupdate "$tmpdepfile"
-  fi
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-
-  if test -f "$tmpdepfile"; then  # yes, the sourcefile depend on other files
-    echo "$object : \\" > "$depfile"
-
-    # Clip off the initial element (the dependent).  Don't try to be
-    # clever and replace this with sed code, as IRIX sed won't handle
-    # lines with more than a fixed number of characters (4096 in
-    # IRIX 6.2 sed, 8192 in IRIX 6.5).  We also remove comment lines;
-    # the IRIX cc adds comments like `#:fec' to the end of the
-    # dependency line.
-    tr ' ' '
-' < "$tmpdepfile" \
-    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
-    tr '
-' ' ' >> $depfile
-    echo >> $depfile
-
-    # The second pass generates a dummy entry for each header file.
-    tr ' ' '
-' < "$tmpdepfile" \
-   | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
-   >> $depfile
-  else
-    # The sourcefile does not contain any dependencies, so just
-    # store a dummy comment line, to avoid errors with the Makefile
-    # "include basename.Plo" scheme.
-    echo "#dummy" > "$depfile"
-  fi
-  rm -f "$tmpdepfile"
-  ;;
-
-aix)
-  # The C for AIX Compiler uses -M and outputs the dependencies
-  # in a .u file.  In older versions, this file always lives in the
-  # current directory.  Also, the AIX compiler puts `$object:' at the
-  # start of each line; $object doesn't have directory information.
-  # Version 6 uses the directory in both cases.
-  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-  test "x$dir" = "x$object" && dir=
-  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
-  if test "$libtool" = yes; then
-    tmpdepfile1=$dir$base.u
-    tmpdepfile2=$base.u
-    tmpdepfile3=$dir.libs/$base.u
-    "$@" -Wc,-M
-  else
-    tmpdepfile1=$dir$base.u
-    tmpdepfile2=$dir$base.u
-    tmpdepfile3=$dir$base.u
-    "$@" -M
-  fi
-  stat=$?
-
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
-    exit $stat
-  fi
-
-  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
-  do
-    test -f "$tmpdepfile" && break
-  done
-  if test -f "$tmpdepfile"; then
-    # Each line is of the form `foo.o: dependent.h'.
-    # Do two passes, one to just change these to
-    # `$object: dependent.h' and one to simply `dependent.h:'.
-    sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
-    # That's a tab and a space in the [].
-    sed -e 's,^.*\.[a-z]*:[	 ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
-  else
-    # The sourcefile does not contain any dependencies, so just
-    # store a dummy comment line, to avoid errors with the Makefile
-    # "include basename.Plo" scheme.
-    echo "#dummy" > "$depfile"
-  fi
-  rm -f "$tmpdepfile"
-  ;;
-
-icc)
-  # Intel's C compiler understands `-MD -MF file'.  However on
-  #    icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c
-  # ICC 7.0 will fill foo.d with something like
-  #    foo.o: sub/foo.c
-  #    foo.o: sub/foo.h
-  # which is wrong.  We want:
-  #    sub/foo.o: sub/foo.c
-  #    sub/foo.o: sub/foo.h
-  #    sub/foo.c:
-  #    sub/foo.h:
-  # ICC 7.1 will output
-  #    foo.o: sub/foo.c sub/foo.h
-  # and will wrap long lines using \ :
-  #    foo.o: sub/foo.c ... \
-  #     sub/foo.h ... \
-  #     ...
-
-  "$@" -MD -MF "$tmpdepfile"
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-  # Each line is of the form `foo.o: dependent.h',
-  # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
-  # Do two passes, one to just change these to
-  # `$object: dependent.h' and one to simply `dependent.h:'.
-  sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
-  # Some versions of the HPUX 10.20 sed can't process this invocation
-  # correctly.  Breaking it into two sed invocations is a workaround.
-  sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" |
-    sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-hp2)
-  # The "hp" stanza above does not work with aCC (C++) and HP's ia64
-  # compilers, which have integrated preprocessors.  The correct option
-  # to use with these is +Maked; it writes dependencies to a file named
-  # 'foo.d', which lands next to the object file, wherever that
-  # happens to be.
-  # Much of this is similar to the tru64 case; see comments there.
-  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-  test "x$dir" = "x$object" && dir=
-  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
-  if test "$libtool" = yes; then
-    tmpdepfile1=$dir$base.d
-    tmpdepfile2=$dir.libs/$base.d
-    "$@" -Wc,+Maked
-  else
-    tmpdepfile1=$dir$base.d
-    tmpdepfile2=$dir$base.d
-    "$@" +Maked
-  fi
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-     rm -f "$tmpdepfile1" "$tmpdepfile2"
-     exit $stat
-  fi
-
-  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
-  do
-    test -f "$tmpdepfile" && break
-  done
-  if test -f "$tmpdepfile"; then
-    sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
-    # Add `dependent.h:' lines.
-    sed -ne '2,${; s/^ *//; s/ \\*$//; s/$/:/; p;}' "$tmpdepfile" >> "$depfile"
-  else
-    echo "#dummy" > "$depfile"
-  fi
-  rm -f "$tmpdepfile" "$tmpdepfile2"
-  ;;
-
-tru64)
-   # The Tru64 compiler uses -MD to generate dependencies as a side
-   # effect.  `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'.
-   # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
-   # dependencies in `foo.d' instead, so we check for that too.
-   # Subdirectories are respected.
-   dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-   test "x$dir" = "x$object" && dir=
-   base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
-
-   if test "$libtool" = yes; then
-      # With Tru64 cc, shared objects can also be used to make a
-      # static library.  This mechanism is used in libtool 1.4 series to
-      # handle both shared and static libraries in a single compilation.
-      # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
-      #
-      # With libtool 1.5 this exception was removed, and libtool now
-      # generates 2 separate objects for the 2 libraries.  These two
-      # compilations output dependencies in $dir.libs/$base.o.d and
-      # in $dir$base.o.d.  We have to check for both files, because
-      # one of the two compilations can be disabled.  We should prefer
-      # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
-      # automatically cleaned when .libs/ is deleted, while ignoring
-      # the former would cause a distcleancheck panic.
-      tmpdepfile1=$dir.libs/$base.lo.d   # libtool 1.4
-      tmpdepfile2=$dir$base.o.d          # libtool 1.5
-      tmpdepfile3=$dir.libs/$base.o.d    # libtool 1.5
-      tmpdepfile4=$dir.libs/$base.d      # Compaq CCC V6.2-504
-      "$@" -Wc,-MD
-   else
-      tmpdepfile1=$dir$base.o.d
-      tmpdepfile2=$dir$base.d
-      tmpdepfile3=$dir$base.d
-      tmpdepfile4=$dir$base.d
-      "$@" -MD
-   fi
-
-   stat=$?
-   if test $stat -eq 0; then :
-   else
-      rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
-      exit $stat
-   fi
-
-   for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
-   do
-     test -f "$tmpdepfile" && break
-   done
-   if test -f "$tmpdepfile"; then
-      sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
-      # That's a tab and a space in the [].
-      sed -e 's,^.*\.[a-z]*:[	 ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
-   else
-      echo "#dummy" > "$depfile"
-   fi
-   rm -f "$tmpdepfile"
-   ;;
-
-#nosideeffect)
-  # This comment above is used by automake to tell side-effect
-  # dependency tracking mechanisms from slower ones.
-
-dashmstdout)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout, regardless of -o.
-  "$@" || exit $?
-
-  # Remove the call to Libtool.
-  if test "$libtool" = yes; then
-    while test $1 != '--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-
-  # Remove `-o $object'.
-  IFS=" "
-  for arg
-  do
-    case $arg in
-    -o)
-      shift
-      ;;
-    $object)
-      shift
-      ;;
-    *)
-      set fnord "$@" "$arg"
-      shift # fnord
-      shift # $arg
-      ;;
-    esac
-  done
-
-  test -z "$dashmflag" && dashmflag=-M
-  # Require at least two characters before searching for `:'
-  # in the target name.  This is to cope with DOS-style filenames:
-  # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise.
-  "$@" $dashmflag |
-    sed 's:^[  ]*[^: ][^:][^:]*\:[    ]*:'"$object"'\: :' > "$tmpdepfile"
-  rm -f "$depfile"
-  cat < "$tmpdepfile" > "$depfile"
-  tr ' ' '
-' < "$tmpdepfile" | \
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-dashXmstdout)
-  # This case only exists to satisfy depend.m4.  It is never actually
-  # run, as this mode is specially recognized in the preamble.
-  exit 1
-  ;;
-
-makedepend)
-  "$@" || exit $?
-  # Remove any Libtool call
-  if test "$libtool" = yes; then
-    while test $1 != '--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-  # X makedepend
-  shift
-  cleared=no
-  for arg in "$@"; do
-    case $cleared in
-    no)
-      set ""; shift
-      cleared=yes ;;
-    esac
-    case "$arg" in
-    -D*|-I*)
-      set fnord "$@" "$arg"; shift ;;
-    # Strip any option that makedepend may not understand.  Remove
-    # the object too, otherwise makedepend will parse it as a source file.
-    -*|$object)
-      ;;
-    *)
-      set fnord "$@" "$arg"; shift ;;
-    esac
-  done
-  obj_suffix="`echo $object | sed 's/^.*\././'`"
-  touch "$tmpdepfile"
-  ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
-  rm -f "$depfile"
-  cat < "$tmpdepfile" > "$depfile"
-  sed '1,2d' "$tmpdepfile" | tr ' ' '
-' | \
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile" "$tmpdepfile".bak
-  ;;
-
-cpp)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout.
-  "$@" || exit $?
-
-  # Remove the call to Libtool.
-  if test "$libtool" = yes; then
-    while test $1 != '--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-
-  # Remove `-o $object'.
-  IFS=" "
-  for arg
-  do
-    case $arg in
-    -o)
-      shift
-      ;;
-    $object)
-      shift
-      ;;
-    *)
-      set fnord "$@" "$arg"
-      shift # fnord
-      shift # $arg
-      ;;
-    esac
-  done
-
-  "$@" -E |
-    sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
-       -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
-    sed '$ s: \\$::' > "$tmpdepfile"
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  cat < "$tmpdepfile" >> "$depfile"
-  sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-msvisualcpp)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout, regardless of -o,
-  # because we must use -o when running libtool.
-  "$@" || exit $?
-  IFS=" "
-  for arg
-  do
-    case "$arg" in
-    "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
-	set fnord "$@"
-	shift
-	shift
-	;;
-    *)
-	set fnord "$@" "$arg"
-	shift
-	shift
-	;;
-    esac
-  done
-  "$@" -E |
-  sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::echo "`cygpath -u \\"\1\\"`":p' | sort | uniq > "$tmpdepfile"
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::	\1 \\:p' >> "$depfile"
-  echo "	" >> "$depfile"
-  . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::\1\::p' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-none)
-  exec "$@"
-  ;;
-
-*)
-  echo "Unknown depmode $depmode" 1>&2
-  exit 1
-  ;;
-esac
-
-exit 0
-
-# Local Variables:
-# mode: shell-script
-# sh-indentation: 2
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-end: "$"
-# End:

build-aux/missing (+0 -367)

@@ -1,367 +0,0 @@
-#! /bin/sh
-# Common stub for a few missing GNU programs while installing.
-
-scriptversion=2006-05-10.23
-
-# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006
-#   Free Software Foundation, Inc.
-# Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-# 02110-1301, USA.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-if test $# -eq 0; then
-  echo 1>&2 "Try \`$0 --help' for more information"
-  exit 1
-fi
-
-run=:
-sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p'
-sed_minuso='s/.* -o \([^ ]*\).*/\1/p'
-
-# In the cases where this matters, `missing' is being run in the
-# srcdir already.
-if test -f configure.ac; then
-  configure_ac=configure.ac
-else
-  configure_ac=configure.in
-fi
-
-msg="missing on your system"
-
-case $1 in
---run)
-  # Try to run requested program, and just exit if it succeeds.
-  run=
-  shift
-  "$@" && exit 0
-  # Exit code 63 means version mismatch.  This often happens
-  # when the user try to use an ancient version of a tool on
-  # a file that requires a minimum version.  In this case we
-  # we should proceed has if the program had been absent, or
-  # if --run hadn't been passed.
-  if test $? = 63; then
-    run=:
-    msg="probably too old"
-  fi
-  ;;
-
-  -h|--h|--he|--hel|--help)
-    echo "\
-$0 [OPTION]... PROGRAM [ARGUMENT]...
-
-Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
-error status if there is no known handling for PROGRAM.
-
-Options:
-  -h, --help      display this help and exit
-  -v, --version   output version information and exit
-  --run           try to run the given command, and emulate it if it fails
-
-Supported PROGRAM values:
-  aclocal      touch file \`aclocal.m4'
-  autoconf     touch file \`configure'
-  autoheader   touch file \`config.h.in'
-  autom4te     touch the output file, or create a stub one
-  automake     touch all \`Makefile.in' files
-  bison        create \`y.tab.[ch]', if possible, from existing .[ch]
-  flex         create \`lex.yy.c', if possible, from existing .c
-  help2man     touch the output file
-  lex          create \`lex.yy.c', if possible, from existing .c
-  makeinfo     touch the output file
-  tar          try tar, gnutar, gtar, then tar without non-portable flags
-  yacc         create \`y.tab.[ch]', if possible, from existing .[ch]
-
-Send bug reports to <bug-automake@gnu.org>."
-    exit $?
-    ;;
-
-  -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
-    echo "missing $scriptversion (GNU Automake)"
-    exit $?
-    ;;
-
-  -*)
-    echo 1>&2 "$0: Unknown \`$1' option"
-    echo 1>&2 "Try \`$0 --help' for more information"
-    exit 1
-    ;;
-
-esac
-
-# Now exit if we have it, but it failed.  Also exit now if we
-# don't have it and --version was passed (most likely to detect
-# the program).
-case $1 in
-  lex|yacc)
-    # Not GNU programs, they don't have --version.
-    ;;
-
-  tar)
-    if test -n "$run"; then
-       echo 1>&2 "ERROR: \`tar' requires --run"
-       exit 1
-    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
-       exit 1
-    fi
-    ;;
-
-  *)
-    if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
-       # We have it, but it failed.
-       exit 1
-    elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
-       # Could not run --version or --help.  This is probably someone
-       # running `$TOOL --version' or `$TOOL --help' to check whether
-       # $TOOL exists and not knowing $TOOL uses missing.
-       exit 1
-    fi
-    ;;
-esac
-
-# If it does not exist, or fails to run (possibly an outdated version),
-# try to emulate it.
-case $1 in
-  aclocal*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`acinclude.m4' or \`${configure_ac}'.  You might want
-         to install the \`Automake' and \`Perl' packages.  Grab them from
-         any GNU archive site."
-    touch aclocal.m4
-    ;;
-
-  autoconf)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`${configure_ac}'.  You might want to install the
-         \`Autoconf' and \`GNU m4' packages.  Grab them from any GNU
-         archive site."
-    touch configure
-    ;;
-
-  autoheader)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`acconfig.h' or \`${configure_ac}'.  You might want
-         to install the \`Autoconf' and \`GNU m4' packages.  Grab them
-         from any GNU archive site."
-    files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
-    test -z "$files" && files="config.h"
-    touch_files=
-    for f in $files; do
-      case $f in
-      *:*) touch_files="$touch_files "`echo "$f" |
-				       sed -e 's/^[^:]*://' -e 's/:.*//'`;;
-      *) touch_files="$touch_files $f.in";;
-      esac
-    done
-    touch $touch_files
-    ;;
-
-  automake*)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
-         You might want to install the \`Automake' and \`Perl' packages.
-         Grab them from any GNU archive site."
-    find . -type f -name Makefile.am -print |
-	   sed 's/\.am$/.in/' |
-	   while read f; do touch "$f"; done
-    ;;
-
-  autom4te)
-    echo 1>&2 "\
-WARNING: \`$1' is needed, but is $msg.
-         You might have modified some files without having the
-         proper tools for further handling them.
-         You can get \`$1' as part of \`Autoconf' from any GNU
-         archive site."
-
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -f "$file"; then
-	touch $file
-    else
-	test -z "$file" || exec >$file
-	echo "#! /bin/sh"
-	echo "# Created by GNU Automake missing as a replacement of"
-	echo "#  $ $@"
-	echo "exit 0"
-	chmod +x $file
-	exit 1
-    fi
-    ;;
-
-  bison|yacc)
-    echo 1>&2 "\
-WARNING: \`$1' $msg.  You should only need it if
-         you modified a \`.y' file.  You may need the \`Bison' package
-         in order for those modifications to take effect.  You can get
-         \`Bison' from any GNU archive site."
-    rm -f y.tab.c y.tab.h
-    if test $# -ne 1; then
-        eval LASTARG="\${$#}"
-	case $LASTARG in
-	*.y)
-	    SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" y.tab.c
-	    fi
-	    SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" y.tab.h
-	    fi
-	  ;;
-	esac
-    fi
-    if test ! -f y.tab.h; then
-	echo >y.tab.h
-    fi
-    if test ! -f y.tab.c; then
-	echo 'main() { return 0; }' >y.tab.c
-    fi
-    ;;
-
-  lex|flex)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified a \`.l' file.  You may need the \`Flex' package
-         in order for those modifications to take effect.  You can get
-         \`Flex' from any GNU archive site."
-    rm -f lex.yy.c
-    if test $# -ne 1; then
-        eval LASTARG="\${$#}"
-	case $LASTARG in
-	*.l)
-	    SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
-	    if test -f "$SRCFILE"; then
-	         cp "$SRCFILE" lex.yy.c
-	    fi
-	  ;;
-	esac
-    fi
-    if test ! -f lex.yy.c; then
-	echo 'main() { return 0; }' >lex.yy.c
-    fi
-    ;;
-
-  help2man)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-	 you modified a dependency of a manual page.  You may need the
-	 \`Help2man' package in order for those modifications to take
-	 effect.  You can get \`Help2man' from any GNU archive site."
-
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -f "$file"; then
-	touch $file
-    else
-	test -z "$file" || exec >$file
-	echo ".ab help2man is required to generate this page"
-	exit 1
-    fi
-    ;;
-
-  makeinfo)
-    echo 1>&2 "\
-WARNING: \`$1' is $msg.  You should only need it if
-         you modified a \`.texi' or \`.texinfo' file, or any other file
-         indirectly affecting the aspect of the manual.  The spurious
-         call might also be the consequence of using a buggy \`make' (AIX,
-         DU, IRIX).  You might want to install the \`Texinfo' package or
-         the \`GNU make' package.  Grab either from any GNU archive site."
-    # The file to touch is that specified with -o ...
-    file=`echo "$*" | sed -n "$sed_output"`
-    test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"`
-    if test -z "$file"; then
-      # ... or it is the one specified with @setfilename ...
-      infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
-      file=`sed -n '
-	/^@setfilename/{
-	  s/.* \([^ ]*\) *$/\1/
-	  p
-	  q
-	}' $infile`
-      # ... or it is derived from the source name (dir/f.texi becomes f.info)
-      test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
-    fi
-    # If the file does not exist, the user really needs makeinfo;
-    # let's fail without touching anything.
-    test -f $file || exit 1
-    touch $file
-    ;;
-
-  tar)
-    shift
-
-    # We have already tried tar in the generic part.
-    # Look for gnutar/gtar before invocation to avoid ugly error
-    # messages.
-    if (gnutar --version > /dev/null 2>&1); then
-       gnutar "$@" && exit 0
-    fi
-    if (gtar --version > /dev/null 2>&1); then
-       gtar "$@" && exit 0
-    fi
-    firstarg="$1"
-    if shift; then
-	case $firstarg in
-	*o*)
-	    firstarg=`echo "$firstarg" | sed s/o//`
-	    tar "$firstarg" "$@" && exit 0
-	    ;;
-	esac
-	case $firstarg in
-	*h*)
-	    firstarg=`echo "$firstarg" | sed s/h//`
-	    tar "$firstarg" "$@" && exit 0
-	    ;;
-	esac
-    fi
-
-    echo 1>&2 "\
-WARNING: I can't seem to be able to run \`tar' with the given arguments.
-         You may want to install GNU tar or Free paxutils, or check the
-         command line arguments."
-    exit 1
-    ;;
-
-  *)
-    echo 1>&2 "\
-WARNING: \`$1' is needed, and is $msg.
-         You might have modified some files without having the
-         proper tools for further handling them.  Check the \`README' file,
-         it often tells you about the needed prerequisites for installing
-         this package.  You may also peek at any GNU archive site, in case
-         some other package would contain this missing \`$1' program."
-    exit 1
-    ;;
-esac
-
-exit 0
-
-# Local variables:
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-end: "$"
-# End:

configure.ac (+60 -45)

@@ -14,12 +14,14 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 
-AC_INIT([StarPU],0.2, [http://runtime.bordeaux.inria.fr/StarPU/], starpu)
+AC_INIT([StarPU],0.2.901, [http://runtime.bordeaux.inria.fr/StarPU/], starpu)
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
 AC_CANONICAL_SYSTEM
 AM_INIT_AUTOMAKE([-Wall -Werror foreign])
 
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
 AC_PREREQ(2.60)
 
 AC_PROG_CC
@@ -36,6 +38,8 @@ AC_PROG_LN_S
 
 AC_HEADER_STDC
 
+AC_C_RESTRICT
+
 # on Darwin, GCC targets i386 by default, so we don't have atomic ops
 case "$target" in
 i386-*darwin*) CFLAGS+=" -march=i686 " ;;
@@ -57,29 +61,19 @@ if test x$have_pthread_spin_lock = xyes; then
 fi
 
 # yes, that's non portable, but it's still better than sched_setaffinity
-AC_CHECK_FUNC(pthread_setaffinity_np,
-	[], [AC_DEFINE(DONTBIND, [1], [Threads are not bound on a CPU])])
-
-# There is no posix_memalign on Mac OS X
-AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
-if test x$have_posix_memalign = xyes; then
-	AC_DEFINE(HAVE_POSIX_MEMALIGN,[],[posix_memalign is available])
-fi
+AC_CHECK_FUNCS(pthread_setaffinity_np)
 
+# There is no posix_memalign on Mac OS X, only memalign
+AC_CHECK_FUNCS([posix_memalign memalign])
 
-AC_CHECK_HEADER([malloc.h],[have_malloc_h_header=yes],[have_malloc_h_header=no])
-if test x$have_malloc_h_header = xyes; then
-	AC_DEFINE(HAVE_MALLOC_H_HEADER,[],[malloc.h header is available])
-fi
-
-AC_CHECK_FUNC(memalign, have_memalign=yes, have_memalign=no)
-if test x$have_memalign = xyes; then
-	AC_DEFINE(HAVE_MEMALIGN,[],[memalign is available])
-fi
+AC_CHECK_HEADERS([malloc.h])
 
 # This defines HAVE_SYNC_BUILTINS
 STARPU_CHECK_SYNC_BUILTINS
 
+# This defines HAVE_SYNC_FETCH_AND_ADD
+STARPU_CHECK_SYNC_FETCH_AND_ADD
+
 CPPFLAGS="${CPPFLAGS} -D_GNU_SOURCE "
 
 ###############################################################################
@@ -127,14 +121,27 @@ if test x$enable_cuda = xyes -o x$enable_cuda = xmaybe; then
 	if test -d "$cuda_dir/include/"; then
 		CPPFLAGS="${CPPFLAGS} -I$cuda_dir/include/ "
 	fi
-	if test -d "$cuda_dir/lib/"; then
-		LDFLAGS="${LDFLAGS} -L$cuda_dir/lib/ "
-	fi
 
 	# do we have a valid CUDA setup ?
 	have_valid_cuda=yes
 	AC_CHECK_HEADER([cuda.h],,[have_valid_cuda=no])
-	AC_SEARCH_LIBS([cuInit],[cuda],,[have_valid_cuda=no])
+
+	# we are looking for the proper option in LDFLAGS, so we save the
+	# current value of LDFLAGS so that we can add new things in it and
+	# restore it in case it's not working.
+	SAVED_LDFLAGS="${LDFLAGS}"
+
+	found_cudalib=no
+	if test -d "$cuda_dir/lib/"; then
+		LDFLAGS="${SAVED_LDFLAGS} -L$cuda_dir/lib/ "
+		AC_SEARCH_LIBS([cuInit],[cuda],[found_cudalib=yes],[found_cudalib=no])
+	fi
+
+	if test x$found_cudalib=xno -o -d "$cuda_dir/lib64/"; then
+		LDFLAGS="${SAVED_LDFLAGS} -L$cuda_dir/lib64/ "
+	fi
+
+	AC_SEARCH_LIBS([cuInit],[cuda],[],[have_valid_cuda=no])
 
 	# we also check that CUBLAS is available
 	AC_SEARCH_LIBS([cublasInit],[cublas],,[have_valid_cuda=no])
@@ -229,6 +236,18 @@ fi
 #                                                                             #
 ###############################################################################
 
+AC_MSG_CHECKING(whether debug mode should be enabled)
+AC_ARG_ENABLE(debug, [AS_HELP_STRING([--enable-debug], [enable debug mode])],
+			enable_debug=$enableval, enable_debug=no)
+AC_MSG_RESULT($enable_debug)
+
+if test x$enable_debug = xyes; then
+	CFLAGS=" -O0 "
+else
+	CFLAGS=" -O3 "
+fi
+CFLAGS+=" -gdwarf-2 -g3 "
+
 AC_MSG_CHECKING(whether debug messages should be displayed)
 AC_ARG_ENABLE(verbose, [AS_HELP_STRING([--enable-verbose],
 			[display verbose debug messages])],
@@ -296,18 +315,6 @@ if test x$use_fxt = xyes; then
 	AC_CHECK_LIB(fxt, fut_setup,,AC_MSG_ERROR([cannot find fxt lib]))
 	AC_CHECK_HEADER([fxt/fxt.h],,AC_MSG_ERROR([cannot find headers for fxt]))
 	AC_CHECK_HEADER([fxt/fut.h],,AC_MSG_ERROR([cannot find headers for fxt]))
-
-	# In case FxT traces are generated, we may use our (poor) hand-made gtk
-	# tool to visualize traces
-
-	PKG_PROG_PKG_CONFIG
-	PKG_CHECK_MODULES([GTK], [gtk+-2.0], enable_gtk=yes, enable_gtk=no) 
-	AC_SUBST(USE_GTK, $enable_gtk)
-	if test x$enable_gtk = xyes; then
-		AC_SUBST(GTK_CFLAGS)
-		AC_SUBST(GTK_LIBS)
-		AC_DEFINE(USE_GTK, [1], [enable GTK])
-	fi
 fi
 
 AC_MSG_CHECKING(whether performance debugging should be enabled)
@@ -359,16 +366,6 @@ if test x$enable_priority = xno; then
 	AC_DEFINE(NO_PRIO, [1], [Disable priorities])
 fi
 
-AC_MSG_CHECKING(whether data RW-lock should be used)
-AC_ARG_ENABLE(data-rw-lock, [AS_HELP_STRING([--enable-data-rw-lock],
-			[use data RW-locks])],
-			enable_data_rw_lock=$enableval, enable_data_rw_lock=no)
-AC_MSG_RESULT($enable_data_rw_lock)
-if test x$enable_data_rw_lock = xno; then
-	AC_DEFINE(NO_DATA_RW_LOCK, [1], [data RW-lock are disabled])
-fi
-
-
 AC_MSG_CHECKING(whether allocation cache should be used)
 AC_ARG_ENABLE(allocation-cache, [AS_HELP_STRING([--enable-allocation-cache],
 			[enable data allocation cache])],
@@ -396,8 +393,8 @@ AC_ARG_ENABLE(sync-clock, [AS_HELP_STRING([--enable-sync-clock], [Use monotonic
 		enable_sync_clock=$enableval, enable_sync_clock=no)
 AC_MSG_CHECKING(whether using a synchronous clock)
 AC_MSG_RESULT($enable_sync_clock)
-AC_DEFINE(USE_SYNC_CLOCK, [1], [Use a mononotic clock])
 if test x$enable_sync_clock = xyes; then
+	AC_DEFINE(USE_SYNC_CLOCK, [1], [Use a mononotic clock])
 	AC_CHECK_LIB(rt, clock_gettime,,AC_MSG_ERROR([cannot find clock_gettime]))
 fi
 
@@ -515,6 +512,23 @@ AC_MSG_CHECKING(which BLAS lib should be used)
 AC_MSG_RESULT($blas_lib)
 AC_SUBST(BLAS_LIB,$blas_lib)
 
+PKG_CHECK_MODULES([FFTW],  [fftw3],  [
+  AC_DEFINE([HAVE_FFTW], [1], [Define to 1 if you have the libfftw3 library.])
+  AC_SUBST([HAVE_FFTW], [1])
+], [:])
+PKG_CHECK_MODULES([FFTWF], [fftw3f], [
+  AC_DEFINE([HAVE_FFTWF], [1], [Define to 1 if you have the libfftw3f library.])
+  AC_SUBST([HAVE_FFTWF], [1])
+], [:])
+PKG_CHECK_MODULES([FFTWL], [fftw3l], [
+  AC_DEFINE([HAVE_FFTWL], [1], [Define to 1 if you have the libfftw3l library.])
+  AC_SUBST([HAVE_FFTWFL], [1])
+], [:])
+PKG_CHECK_MODULES([HWLOC], [hwloc], [
+  AC_DEFINE([HAVE_HWLOC], [1], [Define to 1 if you have the hwloc library.])
+  AC_SUBST([HAVE_HWLOC], [1])
+], [:])
+
 AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
 
 AC_OUTPUT([
@@ -523,6 +537,7 @@ AC_OUTPUT([
 	tools/Makefile
 	libstarpu.pc
 	examples/Makefile
+	examples/starpufft/Makefile
 	tests/Makefile
 	doc/Makefile
 ])
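
The AC_CHECK_FUNCS([posix_memalign memalign]) and AC_CHECK_HEADERS([malloc.h]) probes introduced above define HAVE_POSIX_MEMALIGN, HAVE_MEMALIGN and HAVE_MALLOC_H. A hypothetical allocation helper using those symbols might look like the following sketch (illustrative only, not part of the patch):

    #include <stdlib.h>
    #ifdef HAVE_MALLOC_H
    #include <malloc.h>     /* for memalign() on systems that provide it */
    #endif

    /* Hypothetical helper: allocate 'size' bytes aligned on 'alignment'
     * (a power of two, multiple of sizeof(void *)), falling back to plain
     * malloc() when neither aligned allocator is available. */
    void *aligned_malloc(size_t alignment, size_t size)
    {
    #if defined(HAVE_POSIX_MEMALIGN)
            void *ptr = NULL;
            if (posix_memalign(&ptr, alignment, size) != 0)
                    return NULL;
            return ptr;
    #elif defined(HAVE_MEMALIGN)
            return memalign(alignment, size);
    #else
            return malloc(size); /* no alignment guarantee beyond malloc's */
    #endif
    }
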

doc/Gantt (+17 -0)

@@ -0,0 +1,17 @@
+- Configure StarPU to use FxT
+
+	./configure --with-fxt=/home/gonnet/hannibal/Libs/FxT/FxT/target/
+
+- Execute applications as usual
+
+- If the application was properly terminated (ie. starpu_shutdown was called),
+  there should be a file named "/tmp/prof_file_user_".
+
+- Call tools/fxt-tool on that file
+	./tools/fxt-tool -i /tmp/prof_file_user_yourlogin
+
+- Some files should have been created in the current directory
+	- paje.trace : A Gantt diagram of the execution
+		$ vite paje.trace
+	- dag.dot : A graphviz graph of the task dependencies (according to tags)
+		$ dot -Tpdf dag.dot -o dag.pdf

doc/Makefile.am (+4 -0)

@@ -15,3 +15,7 @@
 #
 
 info_TEXINFOS = starpu.texi
+
+MAINTAINERCLEANFILES = starpu.pdf
+
+EXTRA_DIST = starpu.pdf

doc/starpu.texi (+555 -25)

@@ -31,11 +31,13 @@ This manual documents the usage of StarPU
 @comment  better formatting.
 @comment
 @menu
-* Introduction::       A basic introduction to using StarPU.
-* Installing StarPU::  How to configure, build and install StarPU
-* StarPU API::         The API to use StarPU
-* Basic Examples::     Basic examples of the use of StarPU
-* Advanced Topics::    Advanced use of StarPU
+* Introduction::          A basic introduction to using StarPU.
+* Installing StarPU::     How to configure, build and install StarPU.
+* Configuration options:: Configurations options
+* Environment variables:: Environment variables used by StarPU.
+* StarPU API::            The API to use StarPU.
+* Basic Examples::        Basic examples of the use of StarPU.
+* Advanced Topics::       Advanced use of StarPU.
 @end menu
 
 @c ---------------------------------------------------------------------
@@ -114,6 +116,14 @@ by expressing dependencies between tags.
 @c DSM
 @subsection StarPU Data Management Library
 
+Because StarPU schedules tasks at runtime, data transfers have to be
+done automatically and ``just-in-time'' between processing units,
+relieving the application programmer from explicit data transfers.
+Moreover, to avoid unnecessary transfers, StarPU keeps data
+where it was last needed, even if was modified there, and it
+allows multiple copies of the same data to reside at the same time on
+several processing units as long as it is not modified. 
+
 @c ---------------------------------------------------------------------
 @c Installing StarPU
 @c ---------------------------------------------------------------------
@@ -134,7 +144,7 @@ are using the source code from the svn repository, you first need to generate
 the configure scripts and the Makefiles.
 
 @example
-$ autoreconf -i
+$ autoreconf -vfi
 @end example
 
 @subsection Configuring StarPU
@@ -143,7 +153,7 @@ $ autoreconf -i
 $ ./configure
 @end example
 
-@c TODO enumerate the list of interesting options
+@c TODO enumerate the list of interesting options: refer to a specific section
 
 @section Building and Installing StarPU
 
@@ -168,7 +178,7 @@ In order to install StarPU at the location that was specified during
 configuration:
 
 @example
-# make install
+$ make install
 @end example
 
 @subsection pkg-config configuration
@@ -196,6 +206,139 @@ $ pkg-config --libs libstarpu    # options for the linker
 @end example
 
 @c ---------------------------------------------------------------------
+@c Configuration options
+@c ---------------------------------------------------------------------
+
+@node Configuration options
+@chapter Configuration options
+
+TODO
+
+@c ---------------------------------------------------------------------
+@c Environment variables
+@c ---------------------------------------------------------------------
+
+@node Environment variables
+@chapter Environment variables
+
+@menu
+* Workers::     Configuring workers
+* Scheduling::  Configuring the Scheduling engine
+* Misc::        Miscellaneous and debug
+@end menu
+
+TODO, explicit configuration (passed to starpu_init) overrides env variables.
+
+@node Workers
+@section Configuring workers
+
+@menu
+* NCPUS     :: Number of CPU workers
+* NCUDA     :: Number of CUDA workers
+* NGORDON   :: Number of SPU workers (Cell)
+* WORKERS_CPUID  :: Bind workers to specific CPUs
+* WORKERS_GPUID  :: Select specific CUDA devices
+@end menu
+
+@node NCPUS
+@subsection @code{NCPUS} -- Number of CPU workers
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+@node NCUDA
+@subsection @code{NCUDA} -- Number of CUDA workers
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+@node NGORDON
+@subsection @code{NGORDON} -- Number of SPU workers (Cell)
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+
+@node WORKERS_CPUID
+@subsection @code{WORKERS_CPUID} -- Bind workers to specific CPUs
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+@node WORKERS_GPUID
+@subsection @code{WORKERS_GPUID} -- Select specific CUDA devices
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+@node Scheduling
+@section Configuring the Scheduling engine
+
+@menu
+* SCHED     :: Scheduling policy
+* CALIBRATE :: Calibrate performance models
+* PREFETCH  :: Use data prefetch
+@end menu
+
+@node SCHED
+@subsection @code{SCHED} -- Scheduling policy
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+@node CALIBRATE
+@subsection @code{CALIBRATE} -- Calibrate performance models
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+@node PREFETCH
+@subsection @code{PREFETCH} -- Use data prefetch
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+@node Misc
+@section Miscellaneous and debug
+
+@menu
+* LOGFILENAME  :: Select debug file name
+@end menu
+
+@node LOGFILENAME
+@subsection @code{LOGFILENAME} -- Select debug file name
+@table @asis
+
+@item @emph{Description}:
+TODO
+
+@end table
+
+@c ---------------------------------------------------------------------
 @c StarPU API
 @c ---------------------------------------------------------------------
 
@@ -204,6 +347,7 @@ $ pkg-config --libs libstarpu    # options for the linker
 
 @menu
 * Initialization and Termination::       Initialization and Termination methods
+* Workers' Properties::                  Methods to enumerate workers' properties
 * Data Library::                         Methods to manipulate data
 * Codelets and Tasks::                   Methods to construct tasks
 * Tags::                                 Task dependencies
@@ -227,9 +371,12 @@ This is StarPU initialization method, which must be called prior to any other
 StarPU call.  It is possible to specify StarPU's configuration (eg. scheduling
 policy, number of cores, ...) by passing a non-null argument. Default
 configuration is used if the passed argument is @code{NULL}.
+@item @emph{Return value}:
+Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
+indicates that no worker was available (so that StarPU was not be initialized).
 
 @item @emph{Prototype}:
-@code{void starpu_init(struct starpu_conf *conf);}
+@code{int starpu_init(struct starpu_conf *conf);}
 
 @end table
 
@@ -238,9 +385,35 @@ configuration is used if the passed argument is @code{NULL}.
 
 @table @asis
 @item @emph{Description}:
-TODO
-@item @emph{Definition}:
-TODO
+This structure is passed to the @code{starpu_init} function in order configure
+StarPU. When the default value is used, StarPU automatically select the number
+of processing units and takes the default scheduling policy. This parameters
+overwrite the equivalent environnement variables. 
+
+@item @emph{Fields}:
+@table @asis 
+@item @code{sched_policy} (default = NULL):
+This is the name of the scheduling policy. This can also be specified with the
+@code{SCHED} environment variable.
+
+@item @code{ncpus} (default = -1):
+This is the maximum number of CPU cores that StarPU can use. This can also be
+specified with the @code{NCPUS} environment variable.
+
+@item @code{ncuda} (default = -1):
+This is the maximum number of CUDA devices that StarPU can use. This can also be
+specified with the @code{NCUDA} environment variable.
+
+@item @code{nspus} (default = -1):
+This is the maximum number of Cell SPUs that StarPU can use. This can also be
+specified with the @code{NGORDON} environment variable.
+
+@item @code{calibrate} (default = 0):
+If this flag is set, StarPU will calibrate the performance models when
+executing tasks. This can also be specified with the @code{CALIBRATE}
+environment variable.
+@end table
+
 @end table
 
 
@@ -259,6 +432,78 @@ garanteed to be available until this method has been called.
 
 @end table
 
+@node Workers' Properties
+@section Workers' Properties
+
+@menu
+* starpu_get_worker_count:: Get the number of processing units
+* starpu_get_worker_id::    Get the identifier of the current worker
+* starpu_get_worker_type::  Get the type of processing unit associated to a worker
+* starpu_get_worker_name::  Get the name of a worker
+@end menu
+
+@node starpu_get_worker_count
+@subsection @code{starpu_get_worker_count} -- Get the number of processing units
+@table @asis
+
+@item @emph{Description}:
+This function returns the number of workers (ie. processing units executing
+StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}. 
+
+@item @emph{Prototype}:
+@code{unsigned starpu_get_worker_count(void);}
+
+@end table
+
+
+@node starpu_get_worker_id
+@subsection @code{starpu_get_worker_id} -- Get the identifier of the current worker
+@table @asis
+
+@item @emph{Description}:
+This function returns the identifier of the worker associated to the calling
+thread. The returned value is either -1 if the current context is not a StarPU
+worker (ie. when called from the application outside a task or a callback), or
+an integer between 0 and @code{starpu_get_worker_count() - 1}.
+
+@item @emph{Prototype}:
+@code{int starpu_get_worker_count(void);}
+
+@end table
+
+@node starpu_get_worker_type
+@subsection @code{starpu_get_worker_type} -- Get the type of processing unit associated to a worker
+@table @asis
+
+@item @emph{Description}:
+This function returns the type of worker associated to an identifier (as
+returned by the @code{starpu_get_worker_id} function). The returned value
+indicates the architecture of the worker: @code{STARPU_CORE_WORKER} for a CPU
+core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
+@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
+identifier is unspecified.
+
+@item @emph{Prototype}:
+@code{enum starpu_archtype starpu_get_worker_type(int id);}
+
+@end table
+
+@node starpu_get_worker_name
+@subsection @code{starpu_get_worker_name} -- Get the name of a worker
+@table @asis
+
+@item @emph{Description}:
+StarPU associates a unique human readable string to each processing unit. This
+function copies at most the @code{maxlen} first bytes of the unique string
+associated to a worker identified by its identifier @code{id} into the
+@code{dst} buffer. The caller is responsible for ensuring that the @code{dst}
+is a valid pointer to a buffer of @code{maxlen} bytes at least. Calling this
+function on an invalid identifier results in an unspecified behaviour.
+
+@item @emph{Prototype}:
+@code{void starpu_get_worker_name(int id, char *dst, size_t maxlen);}
+
+@end table
 
 @node Data Library
 @section Data Library
@@ -275,22 +520,238 @@ garanteed to be available until this method has been called.
 @section Codelets and Tasks
 
 @menu
+* struct starpu_codelet::         StarPU codelet structure
+* struct starpu_task::            StarPU task structure
+* starpu_task_init::              Initialize a Task
 * starpu_task_create::            Allocate and Initialize a Task
+* starpu_task_destroy::           Destroy a dynamically allocated Task
+* starpu_submit_task::            Submit a Task
+* starpu_wait_task::              Wait for the termination of a Task
+* starpu_wait_all_tasks::	  Wait for the termination of all Tasks
 @end menu
 
 
 @c struct starpu_task
 @c struct starpu_codelet
 
+@node struct starpu_codelet
+@subsection @code{struct starpu_codelet} -- StarPU codelet structure
+@table @asis 
+@item @emph{Description}:
+The codelet structure describes a kernel that is possibly implemented on
+various targets.
+@item @emph{Fields}:
+@table @asis
+@item @code{where}: 
+Indicates which types of processing units are able to execute the codelet.
+@code{CORE|CUDA} for instance indicates that the codelet is implemented for
+both CPU cores and CUDA devices, while @code{GORDON} indicates that it is only
+available on Cell SPUs.
+
+@item @code{core_func} (optional):
+This is a function pointer to the CPU implementation of the codelet. Its
+prototype must be: @code{void core_func(starpu_data_interface_t *descr, void
+*arg)}. The first argument is the array of data managed by the data management
+library, and the second argument is a pointer to the argument (possibly a copy
+of it) passed through the @code{.cl_arg} field of the @code{starpu_task}
+structure. This pointer is ignored if @code{CORE} does not appear in the
+@code{.where} field; otherwise it must be non-null.
+
+@item @code{cuda_func} (optional):
+This is a function pointer to the CUDA implementation of the codelet.
+@emph{This must be a host function written in the CUDA runtime API}. Its
+prototype must be: @code{void cuda_func(starpu_data_interface_t *descr, void
+*arg);}. This pointer is ignored if @code{CUDA} does not appear in the
+@code{.where} field; otherwise it must be non-null.
+
+@item @code{gordon_func} (optional):
+This is the index of the Cell SPU implementation within the Gordon library.
+TODO
+
+@item @code{nbuffers}:
+Specifies the number of arguments taken by the codelet. These arguments are
+managed by the DSM and are accessed from the @code{starpu_data_interface_t *}
+array. The constant argument passed with the @code{.cl_arg} field of the
+@code{starpu_task} structure is not counted in this number.  This value should
+not be above @code{STARPU_NMAXBUFS}.
+
+@item @code{model} (optional):
+This is a pointer to the performance model associated with this codelet. This
+optional field is ignored when null. TODO
+
+@end table
+@end table
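+
+The @code{axpy} example (@code{examples/axpy/axpy.c}) declares such a codelet
+with both a CPU and a CUDA implementation (shown here slightly simplified):
+
+@example
+static starpu_codelet axpy_cl = @{
+        .where = CORE|CUDA,
+        .core_func = axpy_cpu,
+#ifdef USE_CUDA
+        .cuda_func = axpy_gpu,
+#endif
+        .nbuffers = 2
+@};
+@end example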
+
+@node struct starpu_task
+@subsection @code{struct starpu_task} -- StarPU task structure
+@table @asis
+@item @emph{Description}:
+The @code{starpu_task} structure describes a task that can be offloaded on the
+various processing units managed by StarPU. It instantiates a codelet. It can
+either be allocated dynamically with the @code{starpu_task_create} method, or
+declared statically. In the latter case, the programmer has to zero the
+@code{starpu_task} structure and to fill the different fields properly. The
+indicated default values correspond to the configuration of a task allocated
+with @code{starpu_task_create}.
+
+@item @emph{Fields}:
+@table @asis
+@item @code{cl}:
+This is a pointer to the corresponding @code{starpu_codelet} data structure. It
+describes where the kernel should be executed, and supplies the appropriate
+implementations. When set to @code{NULL}, no code is executed during the task;
+such empty tasks can be useful for synchronization purposes.
+
+@item @code{buffers}:
+TODO
+
+@item @code{cl_arg} (optional) (default = NULL):
+TODO
+
+@item @code{cl_arg_size} (optional):
+TODO
+@c ignored if only executable on CPUs or CUDA ...
+
+@item @code{callback_func} (optional) (default = @code{NULL}):
+This is a function pointer of prototype @code{void (*f)(void *)} which
+specifies a possible callback. If that pointer is non-null, the callback
+function is executed @emph{on the host} after the execution of the task. The
+callback is passed the value contained in the @code{callback_arg} field. No
+callback is executed if that field is null.
+
+@item @code{callback_arg} (optional) (default = @code{NULL}):
+This is the pointer passed to the callback function. This field is ignored if
+the @code{callback_func} is null.
+
+@item @code{use_tag} (optional) (default = 0):
+If set, this flag indicates that the task should be associated with the tag
+contained in the @code{tag_id} field. Tags allow the application to synchronize
+with the task and to express task dependencies easily.
+
+@item @code{tag_id}:
+This field contains the tag identifier if the @code{use_tag} field was set; it
+is ignored otherwise.
+
+@item @code{synchronous}:
+If this flag is set, the @code{starpu_submit_task} function is blocking and
+returns only when the task has been executed (or if no worker is able to
+process the task). Otherwise, @code{starpu_submit_task} returns immediately.
+
+@item @code{priority} (optional) (default = @code{DEFAULT_PRIO}):
+This field indicates a level of priority for the task. This is an integer value
+that must be selected between @code{MIN_PRIO} (for the least important tasks)
+and @code{MAX_PRIO} (for the most important tasks), inclusive. The default
+priority is @code{DEFAULT_PRIO}. Scheduling strategies that take priorities
+into account can use this parameter to take better scheduling decisions, but
+the scheduling policy may also ignore it.
+
+@item @code{execute_on_a_specific_worker} (default = 0):
+If this flag is set, StarPU will bypass the scheduler and directly assign this
+task to the worker specified by the @code{workerid} field.
+
+@item @code{workerid} (optional):
+If the @code{execute_on_a_specific_worker} field is set, this field indicates
+the identifier of the worker that should process this task (as returned by
+@code{starpu_get_worker_id}). This field is ignored if the
+@code{execute_on_a_specific_worker} field is set to 0.
+
+@item @code{detach} (optional) (default = 1):
+If this flag is set, it is not possible to synchronize with the task
+by means of @code{starpu_wait_task} later on. Internal data structures
+are only guaranteed to be freed once @code{starpu_wait_task} is called
+if that flag is not set.
+
+@item @code{destroy} (optional) (default = 1):
+If this flag is set, the task structure will automatically be freed, either
+after the execution of the callback if the task is detached, or during
+@code{starpu_wait_task} otherwise. If this flag is not set, dynamically
+allocated data structures will not be freed until @code{starpu_task_destroy} is
+called explicitly. Setting this flag for a statically allocated task structure
+will result in undefined behaviour.
+
+@end table
+@end table
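+
+The @code{axpy} example (@code{examples/axpy/axpy.c}) fills a dynamically
+allocated task as follows (slightly simplified):
+
+@example
+struct starpu_task *task = starpu_task_create();
+
+task->cl = &axpy_cl;
+task->cl_arg = &alpha;
+
+task->buffers[0].handle = get_sub_data(handle_x, 1, b);
+task->buffers[0].mode = STARPU_R;
+task->buffers[1].handle = get_sub_data(handle_y, 1, b);
+task->buffers[1].mode = STARPU_RW;
+
+starpu_submit_task(task);
+@end example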
+
+@node starpu_task_init
+@subsection @code{starpu_task_init} -- Initialize a Task
+@table @asis
+@item @emph{Description}:
+TODO
+@item @emph{Prototype}:
+@code{void starpu_task_init(struct starpu_task *task);}
+@end table
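+
+A sketch of the presumable usage with a statically allocated task structure;
+@code{my_cl} stands for an application codelet, and the function is expected to
+fill the structure with the default values listed above:
+
+@example
+struct starpu_task task;
+
+starpu_task_init(&task);
+task.cl = &my_cl;
+task.synchronous = 1;
+@end example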
+
 @node starpu_task_create
 @subsection @code{starpu_task_create} -- Allocate and Initialize a Task
 @table @asis
 @item @emph{Description}:
 TODO
+(Describe the different default fields ...)
 @item @emph{Prototype}:
 @code{struct starpu_task *starpu_task_create(void);}
 @end table
 
+@node starpu_task_destroy
+@subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task
+@table @asis
+@item @emph{Description}:
+Free the resources allocated during @code{starpu_task_create}. This function
+can be called automatically after the execution of a task by setting the
+@code{.destroy} flag of the @code{starpu_task} structure (default behaviour).
+Calling this function on a statically allocated task results in undefined
+behaviour.
+
+@item @emph{Prototype}:
+@code{void starpu_task_destroy(struct starpu_task *task);}
+@end table
+
+@node starpu_wait_task
+@subsection @code{starpu_wait_task} -- Wait for the termination of a Task
+@table @asis
+@item @emph{Description}:
+This function blocks until the task has been executed. It is not possible to
+synchronize with a task more than once. It is not possible to wait for
+synchronous or detached tasks.
+@item @emph{Return value}:
+Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
+indicates that the specified task was either synchronous or detached.
+@item @emph{Prototype}:
+@code{int starpu_wait_task(struct starpu_task *task);}
+@end table
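+
+A minimal sketch, assuming a codelet @code{my_cl}: the task must be submitted
+with the @code{detach} flag cleared, otherwise it cannot be waited for.
+
+@example
+struct starpu_task *task = starpu_task_create();
+task->cl = &my_cl;
+task->detach = 0;
+
+starpu_submit_task(task);
+starpu_wait_task(task);
+@end example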
+
+
+
+@node starpu_submit_task
+@subsection @code{starpu_submit_task} -- Submit a Task
+@table @asis
+@item @emph{Description}:
+This function submits task @code{task} to StarPU. Calling this function does
+not mean that the task will be executed immediately, as there can be data or
+task (tag) dependencies that are not fulfilled yet: StarPU will take care of
+scheduling this task with respect to such dependencies.
+This function returns immediately if the @code{synchronous} field of the
+@code{starpu_task} structure was set to 0, and blocks until the termination of
+the task otherwise. It is also possible to synchronize the application with
+asynchronous tasks by means of tags, using the @code{starpu_tag_wait} function
+for instance.
+
+In case of success, this function returns 0; a return value of @code{-ENODEV}
+means that there is no worker able to process this task (e.g. there is no GPU
+available and this task is only implemented on top of CUDA), as illustrated in
+the example below.
+@item @emph{Prototype}:
+@code{int starpu_submit_task(struct starpu_task *task);}
+@end table
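+
+The grained Cholesky example (@code{examples/cholesky/dw_cholesky_grain.c})
+checks the return value as follows:
+
+@example
+int ret = starpu_submit_task(entry_task);
+if (STARPU_UNLIKELY(ret == -ENODEV))
+@{
+        fprintf(stderr, "No worker may execute this task\n");
+        exit(-1);
+@}
+@end example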
+
+@node starpu_wait_all_tasks
+@subsection @code{starpu_wait_all_tasks} -- Wait for the termination of all Tasks
+@table @asis
+@item @emph{Description}:
+This function blocks until all the tasks that were submitted are terminated.
+
+@item @emph{Prototype}:
+@code{void starpu_wait_all_tasks(void);}
+@end table
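+
+The audio processing example uses it as a global barrier after submitting one
+task per block of samples (sketch):
+
+@example
+for (iter = 0; iter < niter; iter++)
+        create_starpu_task(iter);
+
+starpu_wait_all_tasks();
+@end example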
+
 
 
 
@@ -306,33 +767,78 @@ TODO
 * starpu_tag_wait::                Block until a Tag is terminated
 * starpu_tag_wait_array::          Block until a set of Tags is terminated
 * starpu_tag_remove::              Destroy a Tag
+* starpu_tag_notify_from_apps::    Feed a tag explicitely
 @end menu
 
 
 @node starpu_tag_t 
 @subsection @code{starpu_tag_t} -- Task identifier
-@c mention the tag_id field of the task structure
 @table @asis
-@item @emph{Definition}:
-TODO
+@item @emph{Description}:
+It is possible to associate a task with a unique "tag" and to express
+dependencies between tasks by means of those tags. To do so, fill the
+@code{tag_id} field of the @code{starpu_task} structure with a tag number
+(which can be arbitrary) and set the @code{use_tag} field to 1, as shown in the
+sketch below.
+
+If @code{starpu_tag_declare_deps} is called with that tag number, the task will
+not be started until the tasks which wear the declared dependency tags are
+complete.
 @end table
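+
+For instance (a minimal sketch):
+
+@example
+struct starpu_task *task = starpu_task_create();
+
+/* the task wears tag 0x42 */
+task->use_tag = 1;
+task->tag_id = (starpu_tag_t)0x42;
+@end example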
 
 @node starpu_tag_declare_deps
 @subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
 @table @asis
 @item @emph{Description}:
-TODO
+Specify the dependencies of the task identified by tag @code{id}. The first
+argument specifies the tag which is configured, the second argument gives the
+number of tag(s) on which @code{id} depends. The following arguments are the
+tags which have to be terminated to unlock the task.
+
+This function must be called before the associated task is submitted to StarPU
+with @code{starpu_submit_task}.
+
+@item @emph{Remark}
+Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
+last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
+typically need to be explicitly cast. Using the
+@code{starpu_tag_declare_deps_array} function avoids this hazard.
+
 @item @emph{Prototype}:
 @code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
+
+@item @emph{Example}:
+@example
+@c @cartouche
+/*  Tag 0x1 depends on tags 0x32 and 0x52 */
+starpu_tag_declare_deps((starpu_tag_t)0x1,
+        2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
+
+@c @end cartouche
+@end example
+
+
 @end table
 
 @node starpu_tag_declare_deps_array
 @subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
 @table @asis
 @item @emph{Description}:
-TODO
+This function is similar to @code{starpu_tag_declare_deps}, except that it
+does not take a variable number of arguments but an array of tags of size
+@code{ndeps}.
 @item @emph{Prototype}:
 @code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
+@item @emph{Example}:
+@example
+@c @cartouche
+/*  Tag 0x1 depends on tags 0x32 and 0x52 */
+starpu_tag_t tag_array[2] = @{0x32, 0x52@};
+starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
+
+@c @end cartouche
+@end example
+
+
 @end table
 
 
@@ -340,7 +846,15 @@ TODO
 @subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
 @table @asis
 @item @emph{Description}:
-TODO
+This function blocks until the task associated with tag @code{id} has been
+executed. This is a blocking call which must therefore not be called within
+tasks or callbacks, but only from the application directly. It is possible to
+synchronize with the same tag multiple times, as long as the
+@code{starpu_tag_remove} function is not called. Note that it is still
+possible to synchronize with a tag associated with a task whose
+@code{starpu_task} data structure was freed (e.g. if the @code{destroy} flag of
+the @code{starpu_task} was enabled).
+
 @item @emph{Prototype}:
 @code{void starpu_tag_wait(starpu_tag_t id);}
 @end table
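+
+The Cholesky examples of this commit use it to stall the application until the
+whole factorization is done; @code{TAG11()} is a macro of those examples that
+builds a tag number from a block index:
+
+@example
+/* stall the application until the end of computations */
+starpu_tag_wait(TAG11(nblocks-1));
+@end example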
@@ -349,22 +863,40 @@ TODO
 @subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
 @table @asis
 @item @emph{Description}:
-TODO
+This function is similar to @code{starpu_tag_wait} except that it blocks until
+@emph{all} the @code{ntags} tags contained in the @code{id} array are
+terminated.
 @item @emph{Prototype}:
 @code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}
 @end table
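+
+The grained Cholesky example waits for all the tasks of the first factorization
+step that way (sketch; @code{ndeps_tags} is the number of tags collected in
+@code{tag_array}):
+
+@example
+starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
+/* ... fill tag_array with the tags of the pending tasks ... */
+
+starpu_tag_wait_array(ndeps_tags, tag_array);
+free(tag_array);
+@end example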
 
 
-
 @node starpu_tag_remove
 @subsection @code{starpu_tag_remove} -- Destroy a Tag
 @table @asis
 @item @emph{Description}:
-TODO
+This function releases the resources associated with tag @code{id}. It can be
+called once the corresponding task has been executed and when no other tag
+depends on that one anymore.
 @item @emph{Prototype}:
 @code{void starpu_tag_remove(starpu_tag_t id);}
 @end table
 
+@node starpu_tag_notify_from_apps
+@subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitely
+@table @asis
+@item @emph{Description}:
+This function explicitly unlocks tag @code{id}. It may be useful in the
+case of applications which execute part of their computation outside StarPU
+tasks (e.g. in third-party libraries).  It is also provided as a
+convenient tool for the programmer, for instance to entirely construct the task
+DAG before actually giving StarPU the opportunity to execute the tasks.
+@item @emph{Prototype}:
+@code{void starpu_tag_notify_from_apps(starpu_tag_t id);}
+@end table
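+
+A sketch: the application unlocks a tag by hand once some computation performed
+outside StarPU has completed, so that the tasks depending on that tag may start
+(the tag numbers below are arbitrary):
+
+@example
+/* the task wearing tag 0x100 depends on tag 0x99, which no task wears */
+starpu_tag_declare_deps((starpu_tag_t)0x100, 1, (starpu_tag_t)0x99);
+
+/* ... some computation performed outside StarPU ... */
+
+/* unlock tag 0x99 so that the task wearing tag 0x100 may start */
+starpu_tag_notify_from_apps((starpu_tag_t)0x99);
+@end example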
+
+
+
 
 @section Extensions
 
@@ -372,8 +904,6 @@ TODO
 
 @c void starpu_malloc_pinned_if_possible(float **A, size_t dim);
 
-@c subsubsection driver API specific calls
-
 @subsection Cell extensions
 
 @c ---------------------------------------------------------------------
@@ -556,8 +1086,8 @@ Programmers can describe the data layout of their application so that StarPU is
 responsible for enforcing data coherency and availability accross the machine.
 Instead of handling complex (and non-portable) mechanisms to perform data
 movements, programmers only declare which piece of data is accessed and/or
-modified by a task, and StarPU makes sure that when a computational kernel starts
-somewhere (eg. on a GPU), its data are available locally.
+modified by a task, and StarPU makes sure that when a computational kernel
+starts somewhere (eg. on a GPU), its data are available locally.
 
 Before submitting those tasks, the programmer first needs to declare the
 different pieces of data to StarPU using the @code{starpu_register_*_data}

+ 94 - 36
examples/Makefile.am

@@ -19,19 +19,25 @@ AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/
 
 TESTS = $(check_PROGRAMS)
 
+SUBDIRS = starpufft
+
 check_PROGRAMS =
 
 BUILT_SOURCES =
 
 EXTRA_DIST = 					\
 	cuda/incrementer_cuda.cu		\
-	cuda/spmv_cuda.cu			\
+	spmv/spmv_cuda.cu			\
 	gordon/null_kernel_gordon.c		\
-	mult/gordon/func_sgemm_ibm.c
+	mult/xgemm.c				\
+	mult/xgemm_kernels.c			\
+	mult/gordon/func_sgemm_ibm.c		\
+	lu/xlu.c				\
+	lu/xlu_pivot.c				\
+	lu/xlu_kernels.c			\
+	lu/lu_example.c
 
 CLEANFILES = 					\
-	cuda/incrementer_cuda.cubin		\
-	cuda/spmv_cuda.cubin			\
 	gordon/null_kernel_gordon.spuelf	\
 	mult/gordon/func_sgemm_ibm.spuelf
 	
@@ -51,10 +57,6 @@ NVCC ?= nvcc
 	$(NVCC) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir}
 
 
-BUILT_SOURCES +=				\
-	cuda/incrementer_cuda.cubin		\
-	cuda/spmv_cuda.cubin			
-
 endif
 
 if USE_GORDON
@@ -88,6 +90,10 @@ noinst_HEADERS = 				\
 	heat/dw_sparse_cg.h			\
 	heat/heat.h				\
 	heat/dw_factolu.h			\
+	lu/xlu.h				\
+	lu/xlu_kernels.h			\
+	lu/float.h				\
+	lu/double.h				\
 	cholesky/dw_cholesky_models.h		\
 	cholesky/dw_cholesky.h			\
 	common/blas_model.h			\
@@ -145,37 +151,53 @@ examplebin_PROGRAMS +=				\
 ppm_downscaler_yuv_downscaler_SOURCES =		\
 	ppm-downscaler/yuv-downscaler.c
 
+
+################
+# AXPY example #
+################
+
+if !NO_BLAS_LIB
+
+examplebin_PROGRAMS +=				\
+	axpy/axpy
+
+axpy_axpy_SOURCES =				\
+	axpy/axpy.c				\
+	common/blas.c
+
+endif
+
 ################
 # Mult example #
 ################
 
 if !NO_BLAS_LIB
 
-examplebin_PROGRAMS += 			\
-	mult/dw_mult 				\
+examplebin_PROGRAMS += 				\
+	mult/sgemm 				\
+	mult/dgemm 				\
 	mult/dw_mult_no_stride			\
 	mult/dw_mult_no_stride_no_tag 
 
-mult_dw_mult_SOURCES = 				\
-	mult/dw_mult.c				\
+mult_sgemm_SOURCES = 				\
+	mult/sgemm.c				\
+	common/blas.c				\
+	common/blas_model.c
+	
+mult_dgemm_SOURCES = 				\
+	mult/dgemm.c				\
 	common/blas.c				\
 	common/blas_model.c
 		
 mult_dw_mult_no_stride_SOURCES = 		\
 	mult/dw_mult_no_stride.c		\
+	mult/sgemm_kernels.c			\
 	common/blas.c				\
 	common/blas_model.c
 
 mult_dw_mult_no_stride_no_tag_SOURCES =		\
 	mult/dw_mult_no_stride_no_tag.c		\
-	common/blas.c				\
-	common/blas_model.c
-
-examplebin_PROGRAMS +=				\
-	mult/dw_mult_no_filters
-
-mult_dw_mult_no_filters_SOURCES =		\
-	mult/dw_mult_no_filters.c		\
+	mult/sgemm_kernels.c			\
 	common/blas.c				\
 	common/blas_model.c
 
@@ -189,7 +211,8 @@ if !NO_BLAS_LIB
 
 examplebin_PROGRAMS += 			\
 	cholesky/dw_cholesky			\
-	cholesky/dw_cholesky_no_stride
+	cholesky/dw_cholesky_no_stride		\
+	cholesky/dw_cholesky_grain
 
 cholesky_dw_cholesky_SOURCES =			\
 	cholesky/dw_cholesky.c			\
@@ -203,8 +226,41 @@ cholesky_dw_cholesky_no_stride_SOURCES =	\
 	cholesky/dw_cholesky_kernels.c		\
 	common/blas.c
 
+cholesky_dw_cholesky_grain_SOURCES =		\
+	cholesky/dw_cholesky_grain.c		\
+	cholesky/dw_cholesky_models.c		\
+	cholesky/dw_cholesky_kernels.c		\
+	common/blas.c
+
+endif
+
+##############
+# LU example #
+##############
+
+if !NO_BLAS_LIB
+
+examplebin_PROGRAMS += 				\
+	lu/lu_example_float			\
+	lu/lu_example_double
+
+lu_lu_example_float_SOURCES =			\
+	lu/lu_example_float.c			\
+	lu/slu.c				\
+	lu/slu_pivot.c				\
+	lu/slu_kernels.c			\
+	common/blas.c
+
+lu_lu_example_double_SOURCES =			\
+	lu/lu_example_double.c			\
+	lu/dlu.c				\
+	lu/dlu_pivot.c				\
+	lu/dlu_kernels.c			\
+	common/blas.c
+
 endif
 
+
 ################
 # Heat example #
 ################
@@ -219,6 +275,7 @@ heat_heat_SOURCES =				\
 	heat/heat.c				\
 	heat/dw_factolu.c			\
 	heat/dw_factolu_tag.c			\
+	heat/dw_factolu_grain.c			\
 	heat/dw_sparse_cg.c			\
 	heat/heat_display.c			\
 	heat/lu_kernels_model.c			\
@@ -235,12 +292,14 @@ endif
 check_PROGRAMS +=			\
 	tag_example/tag_example			\
 	tag_example/tag_example3			\
-	tag_example/tag_example2
+	tag_example/tag_example2	\
+	tag_example/tag_restartable
 
 examplebin_PROGRAMS +=			\
 	tag_example/tag_example			\
 	tag_example/tag_example3		\
-	tag_example/tag_example2
+	tag_example/tag_example2	\
+	tag_example/tag_restartable
 
 tag_example_tag_example_SOURCES =		\
 	tag_example/tag_example.c
@@ -251,6 +310,9 @@ tag_example_tag_example2_SOURCES =		\
 tag_example_tag_example3_SOURCES =		\
 	tag_example/tag_example2.c
 
+tag_example_tag_restartable_SOURCES =		\
+	tag_example/tag_restartable.c
+
 ####################
 # Strassen example #
 ####################
@@ -296,7 +358,8 @@ examplebin_PROGRAMS += 				\
 	spmv/dw_block_spmv
 
 spmv_dw_spmv_SOURCES = 				\
-	spmv/dw_spmv.c
+	spmv/dw_spmv.c				\
+	spmv/spmv_cuda.cu
 
 spmv_dw_block_spmv_SOURCES =			\
 	spmv/dw_block_spmv.c			\
@@ -312,21 +375,16 @@ endif
 
 
 check_PROGRAMS +=				\
-	incrementer/incrementer			\
-	incrementer/incrementer_runtime
+	incrementer/incrementer
 
 examplebin_PROGRAMS +=				\
-	incrementer/incrementer			\
-	incrementer/incrementer_runtime
-
-incrementer_incrementer_SOURCES =		\
-	incrementer/incrementer.c
+	incrementer/incrementer
 
 if USE_CUDA
-incrementer_incrementer_runtime_SOURCES =	\
-	incrementer/incrementer_runtime.c	\
-	incrementer/incrementer_runtime_kernels.cu
+incrementer_incrementer_SOURCES =	\
+	incrementer/incrementer.c	\
+	incrementer/incrementer_kernels.cu
 else
-incrementer_incrementer_runtime_SOURCES =	\
-	incrementer/incrementer_runtime.c
+incrementer_incrementer_SOURCES =	\
+	incrementer/incrementer.c
 endif

+ 20 - 0
examples/audio/Makefile

@@ -0,0 +1,20 @@
+CFLAGS += -Wall -g3 -gdwarf-2 -O3 
+
+LIBS+=$$(pkg-config --libs libstarpu) -lcufft
+CFLAGS+=$$(pkg-config --cflags libstarpu)
+
+LIBS+=$$(pkg-config --libs fftw3f)
+CFLAGS+=$$(pkg-config --cflags fftw3f)
+
+all: starpu-audio-processing
+
+starpu-audio-processing.o: starpu-audio-processing.c
+
+starpu-audio-processing: starpu-audio-processing.o
+	$(CC) $(LDFLAGS) starpu-audio-processing.o -o starpu-audio-processing $(LIBS)
+
+clean:
+	rm -f *.o
+	rm -f starpu-audio-processing
+	rm -f input.dat input.raw
+	rm -f output.dat output.wav output.raw

BIN=BIN
examples/audio/input.wav


+ 441 - 0
examples/audio/starpu-audio-processing.c

@@ -0,0 +1,441 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+#include <fftw3.h>
+#ifdef USE_CUDA
+#include <cufft.h>
+#endif
+
+//#define SAVE_RAW	1
+
+#define DEFAULTINPUTFILE	"input.wav"
+#define DEFAULTOUTPUTFILE	"output.wav"
+#define NSAMPLES	(256*1024)
+#define SAMPLERATE	44100
+
+static unsigned nsamples = NSAMPLES;
+
+/* This is a band filter, we want to stop everything that is not between LOWFREQ and HIGHFREQ*/
+/* LOWFREQ < i * SAMPLERATE / NSAMPLE */
+#define LOWFREQ	500U
+#define HIFREQ	800U
+
+static const size_t headersize = 37+9;
+
+static FILE *infile, *outfile;
+static FILE *infile_raw, *outfile_raw;
+static char *inputfilename = DEFAULTINPUTFILE;
+static char *outputfilename = DEFAULTOUTPUTFILE;
+static unsigned use_pin = 0;
+
+unsigned length_data;
+
+/* buffer containing input WAV data */
+float *A;
+
+starpu_data_handle A_handle;
+
+/* For performance evaluation */
+static struct timeval start;
+static struct timeval end;
+static unsigned task_per_worker[STARPU_NMAXWORKERS] = {0};
+
+/* 
+ *	Functions to Manipulate WAV files 
+ */
+
+unsigned get_wav_data_bytes_length(FILE *file)
+{
+	/* this is clearly suboptimal !! */
+	fseek(file, headersize, SEEK_SET);
+
+	unsigned cnt = 0;
+	while (fgetc(file) != EOF)
+		cnt++;
+
+	return cnt;
+}
+
+void copy_wav_header(FILE *srcfile, FILE *dstfile)
+{
+	unsigned char buffer[128];
+
+	fseek(srcfile, 0, SEEK_SET);
+	fseek(dstfile, 0, SEEK_SET);
+
+	fread(buffer, 1, headersize, srcfile);
+	fwrite(buffer, 1, headersize, dstfile);
+}
+
+void read_16bit_wav(FILE *infile, unsigned size, float *arrayout, FILE *save_file)
+{
+	int v;
+#if SAVE_RAW
+	unsigned currentpos = 0;
+#endif
+
+	/* we skip the header to only keep the data */
+	fseek(infile, headersize, SEEK_SET);
+	
+	for (v=0;v<size;v++) {
+		signed char val = (signed char)fgetc(infile);
+		signed char val2 = (signed char)fgetc(infile);
+
+		arrayout[v] = 256*val2 + val;
+
+#if SAVE_RAW
+		fprintf(save_file, "%d %f\n", currentpos++, arrayout[v]);
+#endif
+	}
+}
+
+/* we only write the data, not the header !*/
+void write_16bit_wav(FILE *outfile, unsigned size, float *arrayin, FILE *save_file)
+{
+	int v;
+#if SAVE_RAW
+	unsigned currentpos = 0;
+#endif
+
+	/* we assume that the header is copied using copy_wav_header */
+	fseek(outfile, headersize, SEEK_SET);
+	
+	for (v=0;v<size;v++) {
+		signed char val = ((int)arrayin[v]) % 256; 
+		signed char val2  = ((int)arrayin[v]) / 256;
+
+		fputc(val, outfile);
+		fputc(val2, outfile);
+
+#if SAVE_RAW
+		if (save_file)
+	                fprintf(save_file, "%d %f\n", currentpos++, arrayin[v]);
+#endif
+	}
+}
+
+
+/*
+ *
+ *	The actual kernels
+ *
+ */
+
+/* we don't reinitialize the CUFFT plan for every kernel, so we "cache" it */
+typedef struct {
+	unsigned is_initialized;
+#ifdef USE_CUDA
+	cufftHandle plan;
+	cufftHandle inv_plan;
+	cufftComplex *localout;
+#endif
+	fftwf_complex *localout_cpu;
+	float *Acopy;
+	fftwf_plan plan_cpu;
+	fftwf_plan inv_plan_cpu;
+} fft_plan_cache;
+
+static fft_plan_cache plans[STARPU_NMAXWORKERS];
+
+#ifdef USE_CUDA
+static void band_filter_kernel_gpu(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	cufftResult cures;
+
+	float *localA = (float *)descr[0].vector.ptr;
+	cufftComplex *localout;
+
+	int workerid = starpu_get_worker_id();
+	
+	/* initialize the plan only during the first iteration */
+	if (!plans[workerid].is_initialized)
+	{
+		cures = cufftPlan1d(&plans[workerid].plan, nsamples, CUFFT_R2C, 1);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+		cures = cufftPlan1d(&plans[workerid].inv_plan, nsamples, CUFFT_C2R, 1);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+		cudaMalloc((void **)&plans[workerid].localout,
+					nsamples*sizeof(cufftComplex));
+		STARPU_ASSERT(plans[workerid].localout);
+
+		plans[workerid].is_initialized = 1;
+	}
+
+	localout = plans[workerid].localout;
+
+	/* FFT */
+	cures = cufftExecR2C(plans[workerid].plan, localA, localout);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	
+	/* filter low freqs */
+	unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
+	cudaMemset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
+
+	/* filter high freqs */
+	unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
+	cudaMemset(&localout[hifreq_index], 0, (nsamples/2 - hifreq_index)*sizeof(cufftComplex));
+
+	/* inverse FFT */
+	cures = cufftExecC2R(plans[workerid].inv_plan, localout, localA);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	/* CUFFT does not normalize its output ! */
+	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
+}
+#endif
+
+static pthread_mutex_t fftw_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static void band_filter_kernel_cpu(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	float *localA = (float *)descr[0].vector.ptr;
+
+	int workerid = starpu_get_worker_id();
+	
+	/* initialize the plan only during the first iteration */
+	if (!plans[workerid].is_initialized)
+	{
+		plans[workerid].localout_cpu = malloc(nsamples*sizeof(fftwf_complex)); 
+		plans[workerid].Acopy = malloc(nsamples*sizeof(float));
+
+		/* create plans, only "fftwf_execute" is thread safe in FFTW ... */
+		pthread_mutex_lock(&fftw_mutex);
+		plans[workerid].plan_cpu = fftwf_plan_dft_r2c_1d(nsamples,
+					plans[workerid].Acopy,
+					plans[workerid].localout_cpu,
+					FFTW_ESTIMATE);
+		plans[workerid].inv_plan_cpu = fftwf_plan_dft_c2r_1d(nsamples,
+					plans[workerid].localout_cpu,
+					plans[workerid].Acopy,
+					FFTW_ESTIMATE);
+		pthread_mutex_unlock(&fftw_mutex);
+
+		plans[workerid].is_initialized = 1;
+	}
+
+	fftwf_complex *localout = plans[workerid].localout_cpu;
+
+	/* copy data into the temporary buffer */
+	memcpy(plans[workerid].Acopy, localA, nsamples*sizeof(float));
+
+	/* FFT */
+	fftwf_execute(plans[workerid].plan_cpu);
+	
+	/* filter low freqs */
+	unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
+	memset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
+
+	/* filter high freqs */
+	unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
+	memset(&localout[hifreq_index], 0, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
+
+	/* inverse FFT */
+	fftwf_execute(plans[workerid].inv_plan_cpu);
+
+	/* copy the result back from the temporary buffer */
+	memcpy(localA, plans[workerid].Acopy, nsamples*sizeof(float));
+
+	/* FFTW does not normalize its output ! */
+	/* TODO use BLAS ?*/
+	int i;
+	for (i = 0; i < nsamples; i++)
+		localA[i] /= nsamples;
+}
+
+struct starpu_perfmodel_t band_filter_model = {
+	.type = HISTORY_BASED,
+	.symbol = "FFT_band_filter"
+};
+
+static starpu_codelet band_filter_cl = {
+	.where = CORE|CUDA,
+#ifdef USE_CUDA
+	.cuda_func = band_filter_kernel_gpu,
+#endif
+	.core_func = band_filter_kernel_cpu,
+	.model = &band_filter_model,
+	.nbuffers = 1
+};
+
+void callback(void *arg)
+{
+	/* do some accounting */
+	int id = starpu_get_worker_id();
+	task_per_worker[id]++;
+}
+
+void create_starpu_task(unsigned iter)
+{
+	struct starpu_task *task = starpu_task_create();
+
+	task->cl = &band_filter_cl;
+
+	task->buffers[0].handle = get_sub_data(A_handle, 1, iter);
+	task->buffers[0].mode = STARPU_RW;
+
+	task->callback_func = callback;
+	task->callback_arg = NULL;
+
+	starpu_submit_task(task);
+}
+
+static void init_problem(void)
+{
+	infile = fopen(inputfilename, "r");
+
+	if (outputfilename)
+		outfile = fopen(outputfilename, "w+");
+
+#if SAVE_RAW
+	infile_raw = fopen("input.raw", "w");
+	outfile_raw = fopen("output.raw", "w");
+#endif
+
+	/* copy input's header into output WAV  */
+	if (outputfilename)
+		copy_wav_header(infile, outfile);
+
+	/* read length of input WAV's data */
+	/* each element is 2 bytes long (16bits)*/
+	length_data = get_wav_data_bytes_length(infile)/2;
+
+	/* allocate a buffer to store the content of input file */
+	if (use_pin)
+	{
+		starpu_malloc_pinned_if_possible((void **)&A, length_data*sizeof(float));
+	}
+	else {
+		A = malloc(length_data*sizeof(float));
+	}
+
+	/* allocate working buffer (this could be done online, but we'll keep it simple) */
+	//starpu_malloc_pinned_if_possible((void **)&outdata, length_data*sizeof(fftwf_complex));
+
+	/* read input data into buffer "A" */
+	read_16bit_wav(infile, length_data, A, infile_raw);
+}
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-h") == 0) {
+			fprintf(stderr, "Usage: %s [-pin] [-nsamples block_size] [-i input.wav] [-o output.wav | -no-output] [-h]\n", argv[0]);
+			exit(-1);
+		}
+
+		if (strcmp(argv[i], "-i") == 0) {
+			inputfilename = argv[++i];
+		}
+
+		if (strcmp(argv[i], "-o") == 0) {
+			outputfilename = argv[++i];
+		}
+
+		if (strcmp(argv[i], "-no-output") == 0) {
+			outputfilename = NULL;
+		}
+
+		/* block size */
+		if (strcmp(argv[i], "-nsamples") == 0) {
+			char *argptr;
+			nsamples = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-pin") == 0) {
+			use_pin = 1;
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	unsigned iter;
+
+	parse_args(argc, argv);
+
+	fprintf(stderr, "Reading input data\n");
+
+	init_problem();
+
+	unsigned niter = length_data/nsamples;
+
+	fprintf(stderr, "input: %s\noutput: %s\n#chunks %d\n", inputfilename, outputfilename, niter);
+
+	/* launch StarPU */
+	starpu_init(NULL);
+
+	starpu_register_vector_data(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
+
+	starpu_filter f = 
+	{
+		.filter_func = starpu_block_filter_func_vector,
+		.filter_arg = niter
+	};
+
+	starpu_partition_data(A_handle, &f);
+
+	for (iter = 0; iter < niter; iter++)
+		starpu_data_set_wb_mask(get_sub_data(A_handle, 1, iter), 1<<0);
+
+	gettimeofday(&start, NULL);
+
+	for (iter = 0; iter < niter; iter++)
+	{
+		create_starpu_task(iter);
+	}
+
+	starpu_wait_all_tasks();
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took %2.2f ms\n", timing/1000);
+
+	int worker;
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+	{
+		if (task_per_worker[worker])
+		{
+			char name[32];
+			starpu_get_worker_name(worker, name, 32);
+
+			unsigned long bytes = nsamples*sizeof(float)*task_per_worker[worker];
+
+			fprintf(stderr, "\t%s -> %2.2f MB\t%2.2f\tMB/s\t%2.2f %%\n", name, (1.0*bytes)/(1024*1024), bytes/timing, (100.0*task_per_worker[worker])/niter);
+		}
+	}
+
+	if (outputfilename)
+		fprintf(stderr, "Writing output data\n");
+
+	/* make sure that the output is in RAM before quitting StarPU */
+	starpu_unpartition_data(A_handle, 0);
+	starpu_delete_data(A_handle);
+
+	/* we are done ! */
+	starpu_shutdown();
+
+	fclose(infile);
+
+	if (outputfilename)
+	{
+		write_16bit_wav(outfile, length_data, A, outfile_raw);
+		fclose(outfile);
+	}
+
+#if SAVE_RAW
+	fclose(infile_raw);
+	fclose(outfile_raw);
+#endif
+
+	return 0;
+}

+ 162 - 0
examples/axpy/axpy.c

@@ -0,0 +1,162 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <sys/time.h>
+
+
+#include <common/blas.h>
+
+#define TYPE	float
+#define AXPY	SAXPY
+#define CUBLASAXPY	cublasSaxpy
+
+#define N	(16*1024*1024)
+
+#define NBLOCKS	8
+
+TYPE *vec_x, *vec_y;
+
+/* descriptors for StarPU */
+starpu_data_handle handle_y, handle_x;
+
+void axpy_cpu(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	TYPE alpha = *((TYPE *)arg);
+
+	unsigned n = descr[0].vector.nx;
+
+	TYPE *block_x = (TYPE *)descr[0].vector.ptr;
+	TYPE *block_y = (TYPE *)descr[1].vector.ptr;
+
+	AXPY((int)n, alpha, block_x, 1, block_y, 1);
+}
+
+#ifdef USE_CUDA
+void axpy_gpu(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	TYPE alpha = *((TYPE *)arg);
+
+	unsigned n = descr[0].vector.nx;
+
+	TYPE *block_x = (TYPE *)descr[0].vector.ptr;
+	TYPE *block_y = (TYPE *)descr[1].vector.ptr;
+
+	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
+	cudaThreadSynchronize();
+}
+#endif
+
+static starpu_codelet axpy_cl = {
+        .where =
+#ifdef USE_CUDA
+                CUDA|
+#endif
+                CORE,
+
+	.core_func = axpy_cpu,
+#ifdef USE_CUDA
+	.cuda_func = axpy_gpu,
+#endif
+	.nbuffers = 2
+};
+
+int main(int argc, char **argv)
+{
+	/* Initialize StarPU */
+	starpu_init(NULL);
+
+	starpu_helper_init_cublas();
+
+	/* This is equivalent to 
+		vec_a = malloc(N*sizeof(TYPE));
+		vec_b = malloc(N*sizeof(TYPE));
+	*/
+	starpu_malloc_pinned_if_possible((void **)&vec_x, N*sizeof(TYPE));
+	assert(vec_x);
+
+	starpu_malloc_pinned_if_possible((void **)&vec_y, N*sizeof(TYPE));
+	assert(vec_y);
+
+	unsigned i;
+	for (i = 0; i < N; i++)
+	{
+		vec_x[i] = 1.0f;//(TYPE)drand48();
+		vec_y[i] = 4.0f;//(TYPE)drand48();
+	}
+
+	fprintf(stderr, "BEFORE x[0] = %2.2f\n", vec_x[0]);
+	fprintf(stderr, "BEFORE y[0] = %2.2f\n", vec_y[0]);
+
+	/* Declare the data to StarPU */
+	starpu_register_vector_data(&handle_x, 0, (uintptr_t)vec_x, N, sizeof(TYPE));
+	starpu_register_vector_data(&handle_y, 0, (uintptr_t)vec_y, N, sizeof(TYPE));
+
+	/* Divide the vector into blocks */
+	starpu_filter block_filter = {
+		.filter_func = starpu_block_filter_func_vector,
+		.filter_arg = NBLOCKS
+	};
+
+	starpu_partition_data(handle_x, &block_filter);
+	starpu_partition_data(handle_y, &block_filter);
+
+	TYPE alpha = 3.41;
+
+	struct timeval start;
+	struct timeval end;
+	
+	gettimeofday(&start, NULL);
+
+	unsigned b;
+	for (b = 0; b < NBLOCKS; b++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &axpy_cl;
+
+		task->cl_arg = &alpha;
+
+		task->buffers[0].handle = get_sub_data(handle_x, 1, b);
+		task->buffers[0].mode = STARPU_R;
+		
+		task->buffers[1].handle = get_sub_data(handle_y, 1, b);
+		task->buffers[1].mode = STARPU_RW;
+		
+		starpu_submit_task(task);
+	}
+
+	starpu_wait_all_tasks();
+
+	starpu_unpartition_data(handle_y, 0);
+	starpu_delete_data(handle_y);
+
+	gettimeofday(&end, NULL);
+        double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
+                                        (end.tv_usec - start.tv_usec));
+
+	fprintf(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing);
+
+	fprintf(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", vec_y[0], alpha);
+
+	/* Stop StarPU */
+	starpu_shutdown();
+
+	return 0;
+}

+ 19 - 12
examples/cholesky/dw_cholesky.c

@@ -37,10 +37,10 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
 static starpu_codelet cl11 =
 {
-	.where = ANY,
+	.where = CORE|CUDA,
 	.core_func = chol_core_codelet_update_u11,
 #ifdef USE_CUDA
-	.cublas_func = chol_cublas_codelet_update_u11,
+	.cuda_func = chol_cublas_codelet_update_u11,
 #endif
 	.nbuffers = 1,
 	.model = &chol_model_11
@@ -59,7 +59,8 @@ static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
 	task->buffers[0].mode = STARPU_RW;
 
 	/* this is an important task */
-	task->priority = MAX_PRIO;
+	if (!noprio)
+		task->priority = MAX_PRIO;
 
 	/* enforce dependencies ... */
 	if (k > 0) {
@@ -71,10 +72,10 @@ static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k)
 
 static starpu_codelet cl21 =
 {
-	.where = ANY,
+	.where = CORE|CUDA,
 	.core_func = chol_core_codelet_update_u21,
 #ifdef USE_CUDA
-	.cublas_func = chol_cublas_codelet_update_u21,
+	.cuda_func = chol_cublas_codelet_update_u21,
 #endif
 	.nbuffers = 2,
 	.model = &chol_model_21
@@ -92,7 +93,7 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 	task->buffers[1].handle = get_sub_data(dataA, 2, k, j); 
 	task->buffers[1].mode = STARPU_RW;
 
-	if (j == k+1) {
+	if (!noprio && (j == k+1)) {
 		task->priority = MAX_PRIO;
 	}
 
@@ -109,10 +110,10 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 
 static starpu_codelet cl22 =
 {
-	.where = ANY,
+	.where = CORE|CUDA,
 	.core_func = chol_core_codelet_update_u22,
 #ifdef USE_CUDA
-	.cublas_func = chol_cublas_codelet_update_u22,
+	.cuda_func = chol_cublas_codelet_update_u22,
 #endif
 	.nbuffers = 3,
 	.model = &chol_model_22
@@ -134,7 +135,7 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 	task->buffers[2].handle = get_sub_data(dataA, 2, i, j); 
 	task->buffers[2].mode = STARPU_RW;
 
-	if ( (i == k + 1) && (j == k +1) ) {
+	if (!noprio && (i == k + 1) && (j == k +1) ) {
 		task->priority = MAX_PRIO;
 	}
 
@@ -170,6 +171,7 @@ static void _dw_cholesky(starpu_data_handle dataA, unsigned nblocks)
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
+	gettimeofday(&start, NULL);
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -195,13 +197,16 @@ static void _dw_cholesky(starpu_data_handle dataA, unsigned nblocks)
 	}
 
 	/* schedule the codelet */
-	gettimeofday(&start, NULL);
 	starpu_submit_task(entry_task);
 
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
+
+	starpu_unpartition_data(dataA, 0);
+
 	gettimeofday(&end, NULL);
 
+
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 	fprintf(stderr, "Computation took (in ms)\n");
 	printf("%2.2f\n", timing/1000);
@@ -215,12 +220,14 @@ static void _dw_cholesky(starpu_data_handle dataA, unsigned nblocks)
 void initialize_system(float **A, unsigned dim, unsigned pinned)
 {
 	starpu_init(NULL);
+	
+	starpu_helper_init_cublas();
 
 	timing_init();
 
 	if (pinned)
 	{
-		starpu_malloc_pinned_if_possible((void **)A, dim*dim*sizeof(float));
+		starpu_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
 	} 
 	else {
 		*A = malloc(dim*dim*sizeof(float));
@@ -247,7 +254,7 @@ void dw_cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	_dw_cholesky(dataA, nblocks);
 
-	starpu_unpartition_data(dataA, 0);
+	starpu_helper_shutdown_cublas();
 
 	starpu_shutdown();
 }

+ 27 - 3
examples/cholesky/dw_cholesky.h

@@ -20,12 +20,14 @@
 #include <semaphore.h>
 #include <string.h>
 #include <math.h>
+#include <sys/time.h>
 #ifdef USE_CUDA
 #include <cuda.h>
+#include <cuda_runtime.h>
 #include <cublas.h>
 #endif
 
-#include "../common/blas.h"
+#include <common/blas.h>
 #include <starpu.h>
 
 #define NMAXBLOCKS	32
@@ -37,6 +39,17 @@
 					| ((unsigned long long)(i)<<16)	\
 					| (unsigned long long)(j))))
 
+
+
+#define TAG11_AUX(k, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  |  (1ULL<<56) | (unsigned long long)(k)))
+#define TAG21_AUX(k,j, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  			\
+					|  ((3ULL<<56) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22_AUX(k,i,j, prefix)    ((starpu_tag_t)(  (((unsigned long long)(prefix))<<60)	\
+					|  ((4ULL<<56) | ((unsigned long long)(k)<<32)  	\
+					| ((unsigned long long)(i)<<16) 			\
+					| (unsigned long long)(j))))
+
 #define BLOCKSIZE	(size/nblocks)
 
 
@@ -53,9 +66,11 @@ typedef struct {
 	sem_t *sem;
 } cl_args;
 
-static unsigned size = 4*1024;
-static unsigned nblocks = 4;
+static unsigned size = 16*1024;
+static unsigned nblocks = 16;
+static unsigned nbigblocks = 8;
 static unsigned pinned = 0;
+static unsigned noprio = 0;
 
 void chol_core_codelet_update_u11(starpu_data_interface_t *, void *);
 void chol_core_codelet_update_u21(starpu_data_interface_t *, void *);
@@ -88,10 +103,19 @@ static void __attribute__((unused)) parse_args(int argc, char **argv)
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 
+		if (strcmp(argv[i], "-nbigblocks") == 0) {
+		        char *argptr;
+			nbigblocks = strtol(argv[++i], &argptr, 10);
+		}
+
 		if (strcmp(argv[i], "-pin") == 0) {
 			pinned = 1;
 		}
 
+		if (strcmp(argv[i], "-no-prio") == 0) {
+			noprio = 1;
+		}
+
 		if (strcmp(argv[i], "-h") == 0) {
 			printf("usage : %s [-pin] [-size size] [-nblocks nblocks]\n", argv[0]);
 		}

+ 377 - 0
examples/cholesky/dw_cholesky_grain.c

@@ -0,0 +1,377 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_cholesky.h"
+#include "dw_cholesky_models.h"
+
+/*
+ *	Some useful functions
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+		task->use_tag = 1;
+		task->tag_id = id;
+
+	return task;
+}
+
+/*
+ *	Create the codelets
+ */
+
+static starpu_codelet cl11 =
+{
+	.where = CORE|CUDA,
+	.core_func = chol_core_codelet_update_u11,
+#ifdef USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &chol_model_11
+};
+
+static struct starpu_task * create_task_11(starpu_data_handle dataA, unsigned k, unsigned reclevel)
+{
+//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+
+	struct starpu_task *task = create_task(TAG11_AUX(k, reclevel));
+	
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	task->priority = MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11_AUX(k, reclevel), 1, TAG22_AUX(k-1, k, k, reclevel));
+	}
+
+	return task;
+}
+
+static starpu_codelet cl21 =
+{
+	.where = CORE|CUDA,
+	.core_func = chol_core_codelet_update_u21,
+#ifdef USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &chol_model_21
+};
+
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, unsigned reclevel)
+{
+	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
+
+	task->cl = &cl21;	
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (j == k+1) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
+	}
+
+	starpu_submit_task(task);
+}
+
+static starpu_codelet cl22 =
+{
+	.where = CORE|CUDA,
+	.core_func = chol_core_codelet_update_u22,
+#ifdef USE_CUDA
+	.cuda_func = chol_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &chol_model_22
+};
+
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+{
+//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j));
+
+	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, i); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = get_sub_data(dataA, 2, i, j); 
+	task->buffers[2].mode = STARPU_RW;
+
+	if ( (i == k + 1) && (j == k +1) ) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+	}
+
+	starpu_submit_task(task);
+}
+
+
+
+/*
+ *	code to bootstrap the factorization 
+ *	and construct the DAG
+ */
+
+static void _dw_cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks, unsigned reclevel)
+{
+	/* create a new codelet */
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_register_blas_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	starpu_filter f;
+		f.filter_func = starpu_vertical_block_filter_func;
+		f.filter_arg = nblocks;
+
+	starpu_filter f2;
+		f2.filter_func = starpu_block_filter_func;
+		f2.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &f, &f2);
+
+	for (k = 0; k < nbigblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k, reclevel);
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_submit_task(task);
+		}
+		
+		for (j = k+1; j<nblocks; j++)
+		{
+			create_task_21(dataA, k, j, reclevel);
+
+			for (i = k+1; i<nblocks; i++)
+			{
+				if (i <= j)
+					create_task_22(dataA, k, i, j, reclevel);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	int ret = starpu_submit_task(entry_task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		fprintf(stderr, "No worker may execute this task\n");
+		exit(-1);
+	}
+
+	if (nblocks == nbigblocks)
+	{
+		/* stall the application until the end of computations */
+		starpu_tag_wait(TAG11_AUX(nblocks-1, reclevel));
+		starpu_unpartition_data(dataA, 0);
+		return;
+	}
+	else {
+		STARPU_ASSERT(reclevel == 0);
+		unsigned ndeps_tags = (nblocks - nbigblocks)*(nblocks - nbigblocks);
+
+		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
+		STARPU_ASSERT(tag_array);
+
+		unsigned ind = 0;
+		for (i = nbigblocks; i < nblocks; i++)
+		for (j = nbigblocks; j < nblocks; j++)
+		{
+			if (i <= j)
+				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
+		}
+
+		starpu_tag_wait_array(ind, tag_array);
+
+		free(tag_array);
+
+		starpu_unpartition_data(dataA, 0);
+		starpu_delete_data(dataA);
+
+		float *newmatA = &matA[nbigblocks*(size/nblocks)*(ld+1)];
+
+		_dw_cholesky_grain(newmatA, size/nblocks*(nblocks - nbigblocks), ld, (nblocks - nbigblocks)*2, (nblocks - nbigblocks)*2, reclevel+1);
+	}
+}
+
+void initialize_system(float **A, unsigned dim, unsigned pinned)
+{
+	starpu_init(NULL);
+
+	starpu_helper_init_cublas();
+
+	timing_init();
+
+	if (pinned)
+	{
+		starpu_malloc_pinned_if_possible((void **)A, dim*dim*sizeof(float));
+	} 
+	else {
+		*A = malloc(dim*dim*sizeof(float));
+	}
+}
+
+void dw_cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	gettimeofday(&start, NULL);
+
+	_dw_cholesky_grain(matA, size, ld, nblocks, nbigblocks, 0);
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	double flop = (1.0f*size*size*size)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	starpu_helper_shutdown_cublas();
+
+	starpu_shutdown();
+}
+
+int main(int argc, char **argv)
+{
+	/* create a simple symmetric positive definite matrix example
+	 *
+	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
+	 * */
+
+	parse_args(argc, argv);
+
+	float *mat;
+
+	mat = malloc(size*size*sizeof(float));
+	initialize_system(&mat, size, pinned);
+
+	unsigned i,j;
+	for (i = 0; i < size; i++)
+	{
+		for (j = 0; j < size; j++)
+		{
+			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
+			//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+		}
+	}
+
+
+#ifdef CHECK_OUTPUT
+	printf("Input :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				printf("%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				printf(".\t");
+			}
+		}
+		printf("\n");
+	}
+#endif
+
+
+	dw_cholesky_grain(mat, size, size, nblocks, nbigblocks);
+
+#ifdef CHECK_OUTPUT
+	printf("Results :\n");
+
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				printf("%2.2f\t", mat[j +i*size]);
+			}
+			else {
+				printf(".\t");
+				mat[j+i*size] = 0.0f; // debug
+			}
+		}
+		printf("\n");
+	}
+
+	fprintf(stderr, "compute explicit LLt ...\n");
+	float *test_mat = malloc(size*size*sizeof(float));
+	STARPU_ASSERT(test_mat);
+
+	SSYRK("L", "N", size, size, 1.0f, 
+				mat, size, 0.0f, test_mat, size);
+
+	fprintf(stderr, "comparing results ...\n");
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			if (i <= j) {
+				printf("%2.2f\t", test_mat[j +i*size]);
+			}
+			else {
+				printf(".\t");
+			}
+		}
+		printf("\n");
+	}
+#endif
+
+	return 0;
+}

+ 23 - 5
examples/cholesky/dw_cholesky_kernels.c

@@ -14,8 +14,14 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <starpu_config.h>
 #include "dw_cholesky.h"
 #include "../common/blas.h"
+#ifdef USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cublas.h>
+#endif
 
 /*
  *   U22 
@@ -36,6 +42,10 @@ static inline void chol_common_core_codelet_update_u22(starpu_data_interface_t *
 	unsigned ld12 = buffers[1].blas.ld;
 	unsigned ld22 = buffers[2].blas.ld;
 
+#ifdef USE_CUDA
+	cublasStatus st;
+#endif
+
 	switch (s) {
 		case 0:
 			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
@@ -46,6 +56,11 @@ static inline void chol_common_core_codelet_update_u22(starpu_data_interface_t *
 			cublasSgemm('n', 't', dy, dx, dz, 
 					-1.0f, left, ld21, right, ld12, 
 					 1.0f, center, ld22);
+			st = cublasGetError();
+			STARPU_ASSERT(!st);
+
+			cudaThreadSynchronize();
+
 			break;
 #endif
 		default:
@@ -92,6 +107,7 @@ static inline void chol_common_codelet_update_u21(starpu_data_interface_t *buffe
 #ifdef USE_CUDA
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
+			cudaThreadSynchronize();
 			break;
 #endif
 		default:
@@ -157,28 +173,30 @@ static inline void chol_common_codelet_update_u11(starpu_data_interface_t *descr
 			for (z = 0; z < nx; z++)
 			{
 				float lambda11;
-				/* ok that's dirty and ridiculous ... */
-				cublasGetVector(1, sizeof(float), &sub11[z+z*ld], sizeof(float), &lambda11, sizeof(float));
+				cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
 
+				STARPU_ASSERT(lambda11 != 0.0f);
+				
 				lambda11 = sqrt(lambda11);
 
 				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
 
-				STARPU_ASSERT(lambda11 != 0.0f);
-				
 				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
 
 				cublasSsyr('U', nx - z - 1, -1.0f,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
+		
+			cudaThreadSynchronize();
+
 			break;
 #endif
 		default:
 			STARPU_ASSERT(0);
 			break;
 	}
-
 }
 
 

+ 11 - 6
examples/cholesky/dw_cholesky_no_stride.c

@@ -47,10 +47,10 @@ static void terminal_callback(void *argcb)
 
 static starpu_codelet cl11 =
 {
-	.where = ANY,
+	.where = CORE|CUDA|GORDON,
 	.core_func = chol_core_codelet_update_u11,
 #ifdef USE_CUDA
-	.cublas_func = chol_cublas_codelet_update_u11,
+	.cuda_func = chol_cublas_codelet_update_u11,
 #endif
 #ifdef USE_GORDON
 #ifdef SPU_FUNC_POTRF
@@ -94,10 +94,10 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks, sem_t *
 
 static starpu_codelet cl21 =
 {
-	.where = ANY,
+	.where = CORE|CUDA|GORDON,
 	.core_func = chol_core_codelet_update_u21,
 #ifdef USE_CUDA
-	.cublas_func = chol_cublas_codelet_update_u21,
+	.cuda_func = chol_cublas_codelet_update_u21,
 #endif
 #ifdef USE_GORDON
 #ifdef SPU_FUNC_STRSM
@@ -139,10 +139,10 @@ static void create_task_21(unsigned k, unsigned j)
 
 static starpu_codelet cl22 =
 {
-	.where = ANY,
+	.where = CORE|CUDA|GORDON,
 	.core_func = chol_core_codelet_update_u22,
 #ifdef USE_CUDA
-	.cublas_func = chol_cublas_codelet_update_u22,
+	.cuda_func = chol_cublas_codelet_update_u22,
 #endif
 #ifdef USE_GORDON
 #ifdef SPU_FUNC_SGEMM
@@ -258,6 +258,9 @@ int main(int argc, char **argv)
 	fprintf(stderr, "BLOCK SIZE = %d\n", size / nblocks);
 
 	starpu_init(NULL);
+
+	starpu_helper_init_cublas();
+
 	timing_init();
 
 	for (y = 0; y < nblocks; y++)
@@ -315,6 +318,8 @@ int main(int argc, char **argv)
 
 	dw_cholesky_no_stride();
 
+	starpu_helper_shutdown_cublas();
+
 	starpu_shutdown();
 	return 0;
 }

+ 142 - 3
examples/common/blas.c

@@ -39,16 +39,37 @@ inline void SGEMM(char *transa, char *transb, int M, int N, int K,
 			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
 }
 
+inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
+			double alpha, double *A, int lda, double *B, int ldb, 
+			double beta, double *C, int ldc)
+{
+	enum CBLAS_TRANSPOSE ta = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_TRANSPOSE tb = (toupper(transb[0]) == 'N')?CblasNoTrans:CblasTrans;
+
+	cblas_dgemm(CblasColMajor, ta, tb,
+			M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);				
+}
+
 inline float SASUM(int N, float *X, int incX)
 {
 	return cblas_sasum(N, X, incX);
 }
 
+inline double DASUM(int N, double *X, int incX)
+{
+	return cblas_dasum(N, X, incX);
+}
+
 void SSCAL(int N, float alpha, float *X, int incX)
 {
 	cblas_sscal(N, alpha, X, incX);
 }
 
+void DSCAL(int N, double alpha, double *X, int incX)
+{
+	cblas_dscal(N, alpha, X, incX);
+}
+
 void STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
@@ -62,6 +83,19 @@ void STRSM (const char *side, const char *uplo, const char *transa,
 	cblas_strsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 
+void DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const double alpha, const double *A, const int lda,
+                   double *B, const int ldb)
+{
+	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE transa_ = (toupper(transa[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
+
+	cblas_dtrsm(CblasColMajor, side_, uplo_, transa_, diag_, m, n, alpha, A, lda, B, ldb);
+}
+
 void SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda)
 {
@@ -81,11 +115,18 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 	cblas_ssyrk(CblasColMajor, uplo_, trans_, n, k, alpha, A, lda, beta, C, ldc); 
 }
 
-void SGER (const int m, const int n, const float alpha,
+void SGER(const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda)
 {
-	cblas_sger(CblasRowMajor, m, n, alpha, x, incx, y, incy, A, lda);
+	cblas_sger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
+}
+
+void DGER(const int m, const int n, const double alpha,
+                  const double *x, const int incx, const double *y,
+                  const int incy, double *A, const int lda)
+{
+	cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, A, lda);
 }
 
 void STRSV (const char *uplo, const char *trans, const char *diag, 
@@ -112,6 +153,19 @@ void STRMM(const char *side, const char *uplo, const char *transA,
 	cblas_strmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
 }
 
+void DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb)
+{
+	enum CBLAS_SIDE side_ = (toupper(side[0]) == 'L')?CblasLeft:CblasRight;
+	enum CBLAS_UPLO uplo_ = (toupper(uplo[0]) == 'U')?CblasUpper:CblasLower;
+	enum CBLAS_TRANSPOSE transA_ = (toupper(transA[0]) == 'N')?CblasNoTrans:CblasTrans;
+	enum CBLAS_DIAG diag_ = (toupper(diag[0]) == 'N')?CblasNonUnit:CblasUnit;
+
+	cblas_dtrmm(CblasColMajor, side_, uplo_, transA_, diag_, m, n, alpha, A, lda, B, ldb);
+}
+
 void STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int incX)
@@ -128,6 +182,11 @@ void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, c
 	cblas_saxpy(n, alpha, X, incX, Y, incY);
 }
 
+void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+{
+	cblas_daxpy(n, alpha, X, incX, Y, incY);
+}
+
 int ISAMAX (const int n, float *X, const int incX)
 {
     int retVal;
@@ -135,11 +194,27 @@ int ISAMAX (const int n, float *X, const int incX)
     return retVal;
 }
 
+int IDAMAX (const int n, double *X, const int incX)
+{
+    int retVal;
+    retVal = cblas_idamax(n, X, incX);
+    return retVal;
+}
+
 float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
 {
 	return cblas_sdot(n, x, incx, y, incy);
 }
 
+void SSWAP(const int n, float *x, const int incx, float *y, const int incy)
+{
+	cblas_sswap(n, x, incx, y, incy);
+}
+
+void DSWAP(const int n, double *x, const int incx, double *y, const int incy)
+{
+	cblas_dswap(n, x, incx, y, incy);
+}
 
 #elif defined(GOTO) || defined(SYSTEM_BLAS)
 
@@ -152,16 +227,35 @@ inline void SGEMM(char *transa, char *transb, int M, int N, int K,
 			 &beta, C, &ldc);	
 }
 
+inline void DGEMM(char *transa, char *transb, int M, int N, int K, 
+			double alpha, double *A, int lda, double *B, int ldb, 
+			double beta, double *C, int ldc)
+{
+	dgemm_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
 inline float SASUM(int N, float *X, int incX)
 {
 	return sasum_(&N, X, &incX);
 }
 
+inline double DASUM(int N, double *X, int incX)
+{
+	return dasum_(&N, X, &incX);
+}
+
 void SSCAL(int N, float alpha, float *X, int incX)
 {
 	sscal_(&N, &alpha, X, &incX);
 }
 
+void DSCAL(int N, double alpha, double *X, int incX)
+{
+	dscal_(&N, &alpha, X, &incX);
+}
+
 void STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
@@ -170,6 +264,14 @@ void STRSM (const char *side, const char *uplo, const char *transa,
 	strsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
+void DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const double alpha, const double *A, const int lda,
+                   double *B, const int ldb)
+{
+	dtrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
 void SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda)
 {
@@ -184,13 +286,19 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 	ssyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
 }
 
-void SGER (const int m, const int n, const float alpha,
+void SGER(const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda)
 {
 	sger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
 }
 
+void DGER(const int m, const int n, const double alpha,
+                  const double *x, const int incx, const double *y,
+                  const int incy, double *A, const int lda)
+{
+	dger_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
 
 void STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
@@ -207,6 +315,14 @@ void STRMM(const char *side, const char *uplo, const char *transA,
 	strmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
 }
 
+void DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb)
+{
+	dtrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
 void STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int incX)
@@ -219,6 +335,11 @@ void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, c
 	saxpy_(&n, &alpha, X, &incX, Y, &incY);
 }
 
+void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY)
+{
+	daxpy_(&n, &alpha, X, &incX, Y, &incY);
+}
+
 int ISAMAX (const int n, float *X, const int incX)
 {
     int retVal;
@@ -226,6 +347,13 @@ int ISAMAX (const int n, float *X, const int incX)
     return retVal;
 }
 
+int IDAMAX (const int n, double *X, const int incX)
+{
+    int retVal;
+    retVal = idamax_ (&n, X, &incX);
+    return retVal;
+}
+
 float SDOT(const int n, const float *x, const int incx, const float *y, const int incy)
 {
 	float retVal = 0;
@@ -236,6 +364,17 @@ float SDOT(const int n, const float *x, const int incx, const float *y, const in
 	return retVal;
 }
 
+void SSWAP(const int n, float *X, const int incX, float *Y, const int incY)
+{
+	sswap_(&n, X, &incX, Y, &incY);
+}
+
+void DSWAP(const int n, double *X, const int incX, double *Y, const int incY)
+{
+	dswap_(&n, X, &incX, Y, &incY);
+}
+
+
 #else
 #error "no BLAS lib available..."
 #endif
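
The double-precision wrappers added above mirror the single-precision ones and consistently use column-major ordering (note that SGER is also fixed from CblasRowMajor to CblasColMajor). A hypothetical usage sketch, not part of the commit, calling the new DGEMM wrapper on 2x2 column-major matrices:

#include <stdio.h>
#include "blas.h"	/* examples/common/blas.h */

int main(void)
{
	/* column-major storage: A = [1 3; 2 4], B = [5 7; 6 8] */
	double A[4] = {1.0, 2.0, 3.0, 4.0};
	double B[4] = {5.0, 6.0, 7.0, 8.0};
	double C[4] = {0.0, 0.0, 0.0, 0.0};

	/* C = 1.0*A*B + 0.0*C */
	DGEMM("N", "N", 2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);

	/* expected (column-major): C = [23 31; 34 46] */
	printf("%g %g\n%g %g\n", C[0], C[2], C[1], C[3]);
	return 0;
}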

+ 43 - 0
examples/common/blas.h

@@ -25,12 +25,19 @@
 
 void SGEMM(char *transa, char *transb, int M, int N, int K, float alpha, float *A, int lda, 
 		float *B, int ldb, float beta, float *C, int ldc);
+void DGEMM(char *transa, char *transb, int M, int N, int K, double alpha, double *A, int lda, 
+		double *B, int ldb, double beta, double *C, int ldc);
 float SASUM(int N, float *X, int incX);
+double DASUM(int N, double *X, int incX);
 void SSCAL(int N, float alpha, float *X, int incX);
+void DSCAL(int N, double alpha, double *X, int incX);
 void STRSM (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int m, const int n,
                    const float alpha, const float *A, const int lda,
                    float *B, const int ldb);
+void DTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const double alpha, const double *A, const int lda, double *B, const int ldb);
 void SSYR (const char *uplo, const int n, const float alpha,
                   const float *x, const int incx, float *A, const int lda);
 void SSYRK (const char *uplo, const char *trans, const int n,
@@ -40,6 +47,9 @@ void SSYRK (const char *uplo, const char *trans, const int n,
 void SGER (const int m, const int n, const float alpha,
                   const float *x, const int incx, const float *y,
                   const int incy, float *A, const int lda);
+void DGER(const int m, const int n, const double alpha,
+                  const double *x, const int incx, const double *y,
+                  const int incy, double *A, const int lda);
 void STRSV (const char *uplo, const char *trans, const char *diag, 
                    const int n, const float *A, const int lda, float *x, 
                    const int incx);
@@ -47,12 +57,20 @@ void STRMM(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int m, const int n,
                  const float alpha, const float *A, const int lda,
                  float *B, const int ldb);
+void DTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const double alpha, const double *A, const int lda,
+                 double *B, const int ldb);
 void STRMV(const char *uplo, const char *transA, const char *diag,
                  const int n, const float *A, const int lda, float *X,
                  const int incX);
 void SAXPY(const int n, const float alpha, float *X, const int incX, float *Y, const int incy);
+void DAXPY(const int n, const double alpha, double *X, const int incX, double *Y, const int incY);
 int ISAMAX (const int n, float *X, const int incX);
+int IDAMAX (const int n, double *X, const int incX);
 float SDOT(const int n, const float *x, const int incx, const float *y, const int incy);
+void SSWAP(const int n, float *x, const int incx, float *y, const int incy);
+void DSWAP(const int n, double *x, const int incx, double *y, const int incy);
 
 #if defined(GOTO) || defined(SYSTEM_BLAS)
 
@@ -61,6 +79,11 @@ extern void sgemm_ (const char *transa, const char *transb, const int *m,
                    const float *A, const int *lda, const float *B, 
                    const int *ldb, const float *beta, float *C, 
                    const int *ldc);
+extern void dgemm_ (const char *transa, const char *transb, const int *m,
+                   const int *n, const int *k, const double *alpha, 
+                   const double *A, const int *lda, const double *B, 
+                   const int *ldb, const double *beta, double *C, 
+                   const int *ldc);
 extern void ssyr_ (const char *uplo, const int *n, const float *alpha,
                   const float *x, const int *incx, float *A, const int *lda);
 extern void ssyrk_ (const char *uplo, const char *trans, const int *n,
@@ -71,12 +94,22 @@ extern void strsm_ (const char *side, const char *uplo, const char *transa,
                    const char *diag, const int *m, const int *n,
                    const float *alpha, const float *A, const int *lda,
                    float *B, const int *ldb);
+extern void dtrsm_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int *m, const int *n,
+                   const double *alpha, const double *A, const int *lda,
+                   double *B, const int *ldb);
 extern double sasum_ (const int *n, const float *x, const int *incx);
+extern double dasum_ (const int *n, const double *x, const int *incx);
 extern void sscal_ (const int *n, const float *alpha, float *x,
                    const int *incx);
+extern void dscal_ (const int *n, const double *alpha, double *x,
+                   const int *incx);
 extern void sger_(const int *m, const int *n, const float *alpha,
                   const float *x, const int *incx, const float *y,
                   const int *incy, float *A, const int *lda);
+extern void dger_(const int *m, const int *n, const double *alpha,
+                  const double *x, const int *incx, const double *y,
+                  const int *incy, double *A, const int *lda);
 extern void strsv_ (const char *uplo, const char *trans, const char *diag, 
                    const int *n, const float *A, const int *lda, float *x, 
                    const int *incx);
@@ -84,14 +117,24 @@ extern void strmm_(const char *side, const char *uplo, const char *transA,
                  const char *diag, const int *m, const int *n,
                  const float *alpha, const float *A, const int *lda,
                  float *B, const int *ldb);
+extern void dtrmm_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int *m, const int *n,
+                 const double *alpha, const double *A, const int *lda,
+                 double *B, const int *ldb);
 extern void strmv_(const char *uplo, const char *transA, const char *diag,
                  const int *n, const float *A, const int *lda, float *X,
                  const int *incX);
 extern void saxpy_(const int *n, const float *alpha, float *X, const int *incX,
 		float *Y, const int *incy);
+extern void daxpy_(const int *n, const double *alpha, double *X, const int *incX,
+		double *Y, const int *incy);
 extern int isamax_(const int *n, float *X, const int *incX);
+extern int idamax_(const int *n, double *X, const int *incX);
 /* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
 extern double sdot_(const int *n, const float *x, const int *incx, const float *y, const int *incy);
+extern void sswap_(const int *n, float *x, const int *incx, float *y, const int *incy);
+extern void dswap_(const int *n, double *x, const int *incx, double *y, const int *incy);
+
 #endif
 
 #endif // __BLAS_H__

+ 30 - 20
examples/heat/dw_factolu.c

@@ -28,12 +28,14 @@ static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
 static unsigned finished = 0;
 
+static unsigned no_prio = 0;
+
 static starpu_codelet cl11 =
 {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func = dw_core_codelet_update_u11,
 #ifdef USE_CUDA
-	.cublas_func = dw_cublas_codelet_update_u11,
+	.cuda_func = dw_cublas_codelet_update_u11,
 #endif
 	.nbuffers = 1,
 	.model = &model_11
@@ -41,10 +43,10 @@ static starpu_codelet cl11 =
 
 static starpu_codelet cl12 =
 {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func = dw_core_codelet_update_u12,
 #ifdef USE_CUDA
-	.cublas_func = dw_cublas_codelet_update_u12,
+	.cuda_func = dw_cublas_codelet_update_u12,
 #endif
 	.nbuffers = 2,
 	.model = &model_12
@@ -52,10 +54,10 @@ static starpu_codelet cl12 =
 
 static starpu_codelet cl21 =
 {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func = dw_core_codelet_update_u21,
 #ifdef USE_CUDA
-	.cublas_func = dw_cublas_codelet_update_u21,
+	.cuda_func = dw_cublas_codelet_update_u21,
 #endif
 	.nbuffers = 2,
 	.model = &model_21
@@ -63,10 +65,10 @@ static starpu_codelet cl21 =
 
 static starpu_codelet cl22 =
 {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func = dw_core_codelet_update_u22,
 #ifdef USE_CUDA
-	.cublas_func = dw_cublas_codelet_update_u22,
+	.cuda_func = dw_cublas_codelet_update_u22,
 #endif
 	.nbuffers = 3,
 	.model = &model_22
@@ -112,7 +114,9 @@ void dw_callback_v2_codelet_update_u22(void *argcb)
 		u11arg->nblocks = args->nblocks;
 
 		/* schedule the codelet */
-		task->priority = MAX_PRIO;
+		if (!no_prio)
+			task->priority = MAX_PRIO;
+
 		starpu_submit_task(task);
 	}
 
@@ -237,7 +241,7 @@ void dw_callback_v2_codelet_update_u12(void *argcb)
 				task22->buffers[2].mode = STARPU_RW;
 				
 				/* schedule that codelet */
-				if (slicey == i+1) 
+				if (!no_prio && (slicey == i+1))
 					task22->priority = MAX_PRIO;
 
 				starpu_submit_task(task22);
@@ -296,7 +300,7 @@ void dw_callback_v2_codelet_update_u21(void *argcb)
 				task22->buffers[2].mode = STARPU_RW;
 				
 				/* schedule that codelet */
-				if (slicex == i+1)
+				if (!no_prio && (slicex == i+1))
 					task22->priority = MAX_PRIO;
 
 				starpu_submit_task(task22);
@@ -363,7 +367,7 @@ void dw_callback_v2_codelet_update_u11(void *argcb)
 					task12->buffers[1].handle = get_sub_data(args->dataA, 2, u12a->k, u12a->i); 
 					task12->buffers[1].mode = STARPU_RW;
 
-					if (slice == i +1) 
+					if (!no_prio && (slice == i +1))
 						task12->priority = MAX_PRIO;
 
 					starpu_submit_task(task12);
@@ -400,7 +404,7 @@ void dw_callback_v2_codelet_update_u11(void *argcb)
 					task21->buffers[1].handle = get_sub_data(args->dataA, 2, u21a->i, u21a->k);
 					task21->buffers[1].mode = STARPU_RW;
 		
-					if (slice == i +1)
+					if (!no_prio && (slice == i +1))
 						task21->priority = MAX_PRIO;
 
 					starpu_submit_task(task21);
@@ -689,30 +693,36 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 
 	timing_init();
 
+	starpu_helper_init_cublas();
+
 	if (pinned)
 	{
-		starpu_malloc_pinned_if_possible((void **)A, dim*dim*sizeof(float));
-		starpu_malloc_pinned_if_possible((void **)B, dim*sizeof(float));
+		starpu_malloc_pinned_if_possible((void **)A, (size_t)dim*dim*sizeof(float));
+		starpu_malloc_pinned_if_possible((void **)B, (size_t)dim*sizeof(float));
 	} 
 	else {
-		*A = malloc(dim*dim*sizeof(float));
-		*B = malloc(dim*sizeof(float));
+		*A = malloc((size_t)dim*dim*sizeof(float));
+		STARPU_ASSERT(*A);
+		*B = malloc((size_t)dim*sizeof(float));
+		STARPU_ASSERT(*B);
 	}
 }
 
 void dw_factoLU(float *matA, unsigned size, 
 		unsigned ld, unsigned nblocks, 
-		unsigned version)
+		unsigned version, unsigned _no_prio)
 {
 
 #ifdef CHECK_RESULTS
 	fprintf(stderr, "Checking results ...\n");
 	float *Asaved;
-	Asaved = malloc(ld*ld*sizeof(float));
+	Asaved = malloc((size_t)ld*ld*sizeof(float));
 
-	memcpy(Asaved, matA, ld*ld*sizeof(float));
+	memcpy(Asaved, matA, (size_t)ld*ld*sizeof(float));
 #endif
 
+	no_prio = _no_prio;
+
 	starpu_data_handle dataA;
 
 	/* monitor and partition the A matrix into blocks :

+ 2 - 0
examples/heat/dw_factolu.h

@@ -20,10 +20,12 @@
 #include <semaphore.h>
 #include <string.h>
 #include <math.h>
+#include <sys/time.h>
 /* for USE_CUDA */
 #include <starpu_config.h>
 #ifdef USE_CUDA
 #include <cuda.h>
+#include <cuda_runtime.h>
 #include <cublas.h>
 #endif
 

+ 340 - 0
examples/heat/dw_factolu_grain.c

@@ -0,0 +1,340 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_factolu.h"
+
+#define TAG11(k, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  |  (1ULL<<56) | (unsigned long long)(k)))
+#define TAG12(k,i, prefix)	((starpu_tag_t)((((unsigned long long)(prefix))<<60)  | ((2ULL<<56) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG21(k,j, prefix)	((starpu_tag_t)( (((unsigned long long)(prefix))<<60)  |  ((3ULL<<56) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j, prefix)	((starpu_tag_t)(  (((unsigned long long)(prefix))<<60)  |  ((4ULL<<56) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+
+/*
+ *	Construct the DAG
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+static starpu_codelet cl11 = {
+	.where = CORE|CUDA,
+	.core_func = dw_core_codelet_update_u11,
+#ifdef USE_CUDA
+	.cuda_func = dw_cublas_codelet_update_u11,
+#endif
+	.nbuffers = 1,
+	.model = &model_11
+};
+
+static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k, unsigned tag_prefix)
+{
+//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+
+	struct starpu_task *task = create_task(TAG11(k, tag_prefix));
+
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	task->priority = MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k, tag_prefix), 1, TAG22(k-1, k, k, tag_prefix));
+	}
+
+	return task;
+}
+
+static starpu_codelet cl12 = {
+	.where = CORE|CUDA,
+	.core_func = dw_core_codelet_update_u12,
+#ifdef USE_CUDA
+	.cuda_func = dw_cublas_codelet_update_u12,
+#endif
+	.nbuffers = 2,
+	.model = &model_12
+};
+
+static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i, unsigned tag_prefix)
+{
+//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
+
+	struct starpu_task *task = create_task(TAG12(k, i, tag_prefix));
+	
+	task->cl = &cl12;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_sub_data(dataA, 2, i, k); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (i == k+1) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG12(k, i, tag_prefix), 2, TAG11(k, tag_prefix), TAG22(k-1, i, k, tag_prefix));
+	}
+	else {
+		starpu_tag_declare_deps(TAG12(k, i, tag_prefix), 1, TAG11(k, tag_prefix));
+	}
+
+	starpu_submit_task(task);
+}
+
+static starpu_codelet cl21 = {
+	.where = CORE|CUDA,
+	.core_func = dw_core_codelet_update_u21,
+#ifdef USE_CUDA
+	.cuda_func = dw_cublas_codelet_update_u21,
+#endif
+	.nbuffers = 2,
+	.model = &model_21
+};
+
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j, unsigned tag_prefix)
+{
+	struct starpu_task *task = create_task(TAG21(k, j, tag_prefix));
+
+	task->cl = &cl21;
+	
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (j == k+1) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, j, tag_prefix), 2, TAG11(k, tag_prefix), TAG22(k-1, k, j, tag_prefix));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, j, tag_prefix), 1, TAG11(k, tag_prefix));
+	}
+
+	starpu_submit_task(task);
+}
+
+static starpu_codelet cl22 = {
+	.where = CORE|CUDA,
+	.core_func = dw_core_codelet_update_u22,
+#ifdef USE_CUDA
+	.cuda_func = dw_cublas_codelet_update_u22,
+#endif
+	.nbuffers = 3,
+	.model = &model_22
+};
+
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j, unsigned tag_prefix)
+{
+//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+
+	struct starpu_task *task = create_task(TAG22(k, i, j, tag_prefix));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, i, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_sub_data(dataA, 2, k, j); 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = get_sub_data(dataA, 2, i, j); 
+	task->buffers[2].mode = STARPU_RW;
+
+	if ( (i == k + 1) && (j == k +1) ) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j, tag_prefix), 3, TAG22(k-1, i, j, tag_prefix), TAG12(k, i, tag_prefix), TAG21(k, j, tag_prefix));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j, tag_prefix), 2, TAG12(k, i, tag_prefix), TAG21(k, j, tag_prefix));
+	}
+
+	starpu_submit_task(task);
+}
+
+static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_size,
+				unsigned ld, unsigned blocksize, unsigned tag_prefix)
+{
+	/*
+	 * (re)partition data
+	 */
+	starpu_data_handle dataA;
+	starpu_register_blas_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(float));
+
+	STARPU_ASSERT((size % blocksize) == 0);
+	STARPU_ASSERT((inner_size % blocksize) == 0);
+
+	unsigned nblocks = size / blocksize;
+	unsigned maxk = inner_size / blocksize;
+
+	starpu_filter f;
+		f.filter_func = starpu_vertical_block_filter_func;
+		f.filter_arg = nblocks;
+
+	starpu_filter f2;
+		f2.filter_func = starpu_block_filter_func;
+		f2.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &f, &f2);
+
+
+	/*
+	 * submit tasks
+	 */
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	/* if maxk < nblocks we'll stop before the LU decomposition is totally done */
+	for (k = 0; k < maxk; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k, tag_prefix);
+
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_submit_task(task);
+		}
+		
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(dataA, k, i, tag_prefix);
+			create_task_21(dataA, k, i, tag_prefix);
+		}
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(dataA, k, i, j, tag_prefix);
+			}
+		}
+	}
+
+	int ret = starpu_submit_task(entry_task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		fprintf(stderr, "No worker may execute this task\n");
+		exit(-1);
+	}
+
+	/* is this the last call to dw_factoLU_grain_inner ? */
+	if (inner_size == size)
+	{
+		/* we wait for the last task and we are done */
+		starpu_tag_wait(TAG11(nblocks-1, tag_prefix));
+		starpu_unpartition_data(dataA, 0);		
+		return;
+	}
+	else {
+		/*
+		 * call dw_factoLU_grain_inner recursively in the remaining blocks
+		 */
+
+		unsigned ndeps_tags = (nblocks - maxk)*(nblocks - maxk);
+		starpu_tag_t *tag_array = malloc(ndeps_tags*sizeof(starpu_tag_t));
+		STARPU_ASSERT(tag_array);
+
+		unsigned ind = 0;
+		for (i = maxk; i < nblocks; i++)
+		for (j = maxk; j < nblocks; j++)
+		{
+			tag_array[ind++] = TAG22(maxk-1, i, j, tag_prefix);
+		}
+
+		starpu_tag_wait_array(ndeps_tags, tag_array);
+
+		free(tag_array);
+
+		starpu_unpartition_data(dataA, 0);
+		starpu_delete_data(dataA);
+
+		float *newmatA = &matA[inner_size*(ld+1)];
+
+//		if (tag_prefix < 2)
+//		{
+//			dw_factoLU_grain_inner(newmatA, size-inner_size, (size-inner_size)/2, ld, blocksize/2, tag_prefix+1);
+//		}
+//		else {
+			dw_factoLU_grain_inner(newmatA, size-inner_size, size-inner_size, ld, blocksize/2, tag_prefix+1);
+//		}
+	}
+
+}
+
+void dw_factoLU_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
+{
+
+#ifdef CHECK_RESULTS
+	fprintf(stderr, "Checking results ...\n");
+	float *Asaved;
+	Asaved = malloc(ld*ld*sizeof(float));
+
+	memcpy(Asaved, matA, ld*ld*sizeof(float));
+#endif
+
+	struct timeval start;
+	struct timeval end;
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+
+	/* that's only ok for powers of 2 yet ! */
+	dw_factoLU_grain_inner(matA, size, (size/nblocks) * nbigblocks, ld, size/nblocks, 0);
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	unsigned n = size;
+	double flop = (2.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+#ifdef CHECK_RESULTS
+	compare_A_LU(Asaved, matA, size, ld);
+#endif
+}
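
The TAG11/TAG12/TAG21/TAG22 macros at the top of this new file pack the recursion prefix (bits 60-63), a task-type field (bits 56-59) and the block indices k, i and j into a single 64-bit starpu_tag_t, so each recursion level of dw_factoLU_grain gets its own tag namespace. A standalone sketch, not part of the commit, reproducing the TAG22 arithmetic:

#include <stdio.h>

/* same packing as TAG22 above: prefix | type 4 | k | i | j */
static unsigned long long tag22(unsigned long long prefix, unsigned long long k,
				unsigned long long i, unsigned long long j)
{
	return (prefix << 60) | (4ULL << 56) | (k << 32) | (i << 16) | j;
}

int main(void)
{
	/* prefix 1, k = 2, i = 3, j = 4 -> prints 1400000200030004 */
	printf("TAG22 = %llx\n", tag22(1, 2, 3, 4));
	return 0;
}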

+ 117 - 28
examples/heat/dw_factolu_kernels.c

@@ -16,23 +16,86 @@
 
 #include "dw_factolu.h"
 
-unsigned count_11_core = 0;
-unsigned count_12_core = 0;
-unsigned count_21_core = 0;
-unsigned count_22_core = 0;
+unsigned count_11_per_worker[STARPU_NMAXWORKERS] = {0};
+unsigned count_12_per_worker[STARPU_NMAXWORKERS] = {0};
+unsigned count_21_per_worker[STARPU_NMAXWORKERS] = {0};
+unsigned count_22_per_worker[STARPU_NMAXWORKERS] = {0};
 
-unsigned count_11_cublas = 0;
-unsigned count_12_cublas = 0;
-unsigned count_21_cublas = 0;
-unsigned count_22_cublas = 0;
+unsigned count_total_per_worker[STARPU_NMAXWORKERS] = {0};
+
+unsigned count_11_total = 0;
+unsigned count_12_total = 0;
+unsigned count_21_total = 0;
+unsigned count_22_total = 0;
 
 void display_stat_heat(void)
 {
+	unsigned nworkers = starpu_get_worker_count();
+
 	fprintf(stderr, "STATS : \n");
-	fprintf(stderr, "11 : core %d (%2.2f) cublas %d (%2.2f)\n", count_11_core, (100.0*count_11_core)/(count_11_core+count_11_cublas), count_11_cublas, (100.0*count_11_cublas)/(count_11_core+count_11_cublas));
-	fprintf(stderr, "12 : core %d (%2.2f) cublas %d (%2.2f)\n", count_12_core, (100.0*count_12_core)/(count_12_core+count_12_cublas), count_12_cublas, (100.0*count_12_cublas)/(count_12_core+count_12_cublas));
-	fprintf(stderr, "21 : core %d (%2.2f) cublas %d (%2.2f)\n", count_21_core, (100.0*count_21_core)/(count_21_core+count_21_cublas), count_21_cublas, (100.0*count_21_cublas)/(count_21_core+count_21_cublas));
-	fprintf(stderr, "22 : core %d (%2.2f) cublas %d (%2.2f)\n", count_22_core, (100.0*count_22_core)/(count_22_core+count_22_cublas), count_22_cublas, (100.0*count_22_cublas)/(count_22_core+count_22_cublas));
+
+	unsigned worker;
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		count_total_per_worker[worker] = count_11_per_worker[worker] 
+					+ count_12_per_worker[worker]
+					+ count_21_per_worker[worker]
+					+ count_22_per_worker[worker];
+
+		count_11_total += count_11_per_worker[worker];
+		count_12_total += count_12_per_worker[worker];
+		count_21_total += count_21_per_worker[worker];
+		count_22_total += count_22_per_worker[worker];
+	}
+
+	fprintf(stderr, "\t11 (diagonal block LU)\n");
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		if (count_total_per_worker[worker])
+		{
+			char name[32];
+			starpu_get_worker_name(worker, name, 32);
+			
+			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_11_per_worker[worker], count_11_total, (100.0*count_11_per_worker[worker])/count_11_total);
+		}
+	}
+
+	fprintf(stderr, "\t12 (TRSM)\n");
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		if (count_total_per_worker[worker])
+		{
+			char name[32];
+			starpu_get_worker_name(worker, name, 32);
+			
+			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_12_per_worker[worker], count_12_total, (100.0*count_12_per_worker[worker])/count_12_total);
+		}
+	}
+	
+	
+	fprintf(stderr, "\t21 (TRSM)\n");
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		if (count_total_per_worker[worker])
+		{
+			char name[32];
+			starpu_get_worker_name(worker, name, 32);
+			
+			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_21_per_worker[worker], count_21_total, (100.0*count_21_per_worker[worker])/count_21_total);
+		}
+	}
+	
+	fprintf(stderr, "\t22 (SGEMM)\n");
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		if (count_total_per_worker[worker])
+		{
+			char name[32];
+			starpu_get_worker_name(worker, name, 32);
+			
+			fprintf(stderr, "\t\t%s -> %d / %d (%2.2f %%)\n", name, count_22_per_worker[worker], count_22_total, (100.0*count_22_per_worker[worker])/count_22_total);
+		}
+	}
 }
 
 /*
@@ -72,6 +135,8 @@ static inline void dw_common_core_codelet_update_u22(starpu_data_interface_t *bu
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ASSERT(0);
 
+			cudaThreadSynchronize();
+
 			break;
 #endif
 		default:
@@ -83,14 +148,18 @@ static inline void dw_common_core_codelet_update_u22(starpu_data_interface_t *bu
 void dw_core_codelet_update_u22(starpu_data_interface_t *descr, void *_args)
 {
 	dw_common_core_codelet_update_u22(descr, 0, _args);
-	(void)STARPU_ATOMIC_ADD(&count_22_core, 1);
+
+	int id = starpu_get_worker_id();
+	count_22_per_worker[id]++;
 }
 
 #ifdef USE_CUDA
 void dw_cublas_codelet_update_u22(starpu_data_interface_t *descr, void *_args)
 {
 	dw_common_core_codelet_update_u22(descr, 1, _args);
-	(void)STARPU_ATOMIC_ADD(&count_22_cublas, 1);
+
+	int id = starpu_get_worker_id();
+	count_22_per_worker[id]++;
 }
 #endif// USE_CUDA
 
@@ -129,6 +198,8 @@ static inline void dw_common_codelet_update_u12(starpu_data_interface_t *buffers
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ASSERT(0);
 
+			cudaThreadSynchronize();
+
 			break;
 #endif
 		default:
@@ -140,14 +211,18 @@ static inline void dw_common_codelet_update_u12(starpu_data_interface_t *buffers
 void dw_core_codelet_update_u12(starpu_data_interface_t *descr, void *_args)
 {
 	dw_common_codelet_update_u12(descr, 0, _args);
-	(void)STARPU_ATOMIC_ADD(&count_12_core, 1);
+
+	int id = starpu_get_worker_id();
+	count_12_per_worker[id]++;
 }
 
 #ifdef USE_CUDA
 void dw_cublas_codelet_update_u12(starpu_data_interface_t *descr, void *_args)
 {
 	 dw_common_codelet_update_u12(descr, 1, _args);
-	(void)STARPU_ATOMIC_ADD(&count_12_cublas, 1);
+
+	int id = starpu_get_worker_id();
+	count_12_per_worker[id]++;
 }
 #endif // USE_CUDA
 
@@ -183,6 +258,8 @@ static inline void dw_common_codelet_update_u21(starpu_data_interface_t *buffers
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ASSERT(0);
 
+			cudaThreadSynchronize();
+
 			break;
 #endif
 		default:
@@ -193,15 +270,19 @@ static inline void dw_common_codelet_update_u21(starpu_data_interface_t *buffers
 
 void dw_core_codelet_update_u21(starpu_data_interface_t *descr, void *_args)
 {
-	 dw_common_codelet_update_u21(descr, 0, _args);
-	(void)STARPU_ATOMIC_ADD(&count_21_core, 1);
+	dw_common_codelet_update_u21(descr, 0, _args);
+
+	int id = starpu_get_worker_id();
+	count_21_per_worker[id]++;
 }
 
 #ifdef USE_CUDA
 void dw_cublas_codelet_update_u21(starpu_data_interface_t *descr, void *_args)
 {
 	dw_common_codelet_update_u21(descr, 1, _args);
-	(void)STARPU_ATOMIC_ADD(&count_21_cublas, 1);
+
+	int id = starpu_get_worker_id();
+	count_21_per_worker[id]++;
 }
 #endif 
 
@@ -216,7 +297,7 @@ static inline void debug_print(float *tab, unsigned ld, unsigned n)
 	{
 		for (i = 0; i < n; i++)
 		{
-			fprintf(stderr, "%2.2f\t", tab[j+i*ld]);
+			fprintf(stderr, "%2.2f\t", tab[(size_t)j+(size_t)i*ld]);
 		}
 		fprintf(stderr, "\n");
 	}
@@ -230,10 +311,12 @@ static inline void dw_common_codelet_update_u11(starpu_data_interface_t *descr,
 
 	sub11 = (float *)descr[0].blas.ptr; 
 
-	unsigned nx = descr[0].blas.nx;
-	unsigned ld = descr[0].blas.ld;
+	unsigned long nx = descr[0].blas.nx;
+	unsigned long ld = descr[0].blas.ld;
+
+	unsigned long z;
 
-	unsigned z;
 
 	switch (s) {
 		case 0:
@@ -256,8 +339,8 @@ static inline void dw_common_codelet_update_u11(starpu_data_interface_t *descr,
 			for (z = 0; z < nx; z++)
 			{
 				float pivot;
-				/* ok that's dirty and ridiculous ... */
-				cublasGetVector(1, sizeof(float), &sub11[z+z*ld], sizeof(float), &pivot, sizeof(float));
+				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
 
 				STARPU_ASSERT(pivot != 0.0f);
 				
@@ -268,26 +351,32 @@ static inline void dw_common_codelet_update_u11(starpu_data_interface_t *descr,
 								&sub11[(z+1)+z*ld], 1,
 								&sub11[(z+1) + (z+1)*ld],ld);
 			}
+
+			cudaThreadSynchronize();
+
 			break;
 #endif
 		default:
 			STARPU_ASSERT(0);
 			break;
 	}
-
 }
 
 
 void dw_core_codelet_update_u11(starpu_data_interface_t *descr, void *_args)
 {
 	dw_common_codelet_update_u11(descr, 0, _args);
-	(void)STARPU_ATOMIC_ADD(&count_11_core, 1);
+
+	int id = starpu_get_worker_id();
+	count_11_per_worker[id]++;
 }
 
 #ifdef USE_CUDA
 void dw_cublas_codelet_update_u11(starpu_data_interface_t *descr, void *_args)
 {
 	dw_common_codelet_update_u11(descr, 1, _args);
-	(void)STARPU_ATOMIC_ADD(&count_11_cublas, 1);
+
+	int id = starpu_get_worker_id();
+	count_11_per_worker[id]++;
 }
 #endif// USE_CUDA
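
The global core/cublas counters are replaced above by per-worker arrays indexed with starpu_get_worker_id(), which makes the statistics meaningful on hybrid runs mixing several CPUs and GPUs. A minimal sketch of that bookkeeping pattern, assuming only the worker API used in this diff (my_kernel, display_stats and count_per_worker are placeholder names):

#include <stdio.h>
#include <starpu.h>

static unsigned count_per_worker[STARPU_NMAXWORKERS] = {0};

static void my_kernel(starpu_data_interface_t *descr, void *arg)
{
	/* ... do the actual work on descr[0] ... */
	count_per_worker[starpu_get_worker_id()]++;
}

static void display_stats(void)
{
	unsigned worker, nworkers = starpu_get_worker_count();
	for (worker = 0; worker < nworkers; worker++)
	{
		char name[32];
		starpu_get_worker_name(worker, name, 32);
		fprintf(stderr, "%s -> %u tasks\n", name, count_per_worker[worker]);
	}
}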

+ 20 - 15
examples/heat/dw_factolu_tag.c

@@ -25,6 +25,8 @@
 					| ((unsigned long long)(i)<<16)	\
 					| (unsigned long long)(j))))
 
+static unsigned no_prio = 0;
+
 /*
  *	Construct the DAG
  */
@@ -41,10 +43,10 @@ static struct starpu_task *create_task(starpu_tag_t id)
 }
 
 static starpu_codelet cl11 = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func = dw_core_codelet_update_u11,
 #ifdef USE_CUDA
-	.cublas_func = dw_cublas_codelet_update_u11,
+	.cuda_func = dw_cublas_codelet_update_u11,
 #endif
 	.nbuffers = 1,
 	.model = &model_11
@@ -63,7 +65,8 @@ static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 	task->buffers[0].mode = STARPU_RW;
 
 	/* this is an important task */
-	task->priority = MAX_PRIO;
+	if (!no_prio)
+		task->priority = MAX_PRIO;
 
 	/* enforce dependencies ... */
 	if (k > 0) {
@@ -74,10 +77,10 @@ static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
 }
 
 static starpu_codelet cl12 = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func = dw_core_codelet_update_u12,
 #ifdef USE_CUDA
-	.cublas_func = dw_cublas_codelet_update_u12,
+	.cuda_func = dw_cublas_codelet_update_u12,
 #endif
 	.nbuffers = 2,
 	.model = &model_12
@@ -97,7 +100,7 @@ static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
 	task->buffers[1].handle = get_sub_data(dataA, 2, i, k); 
 	task->buffers[1].mode = STARPU_RW;
 
-	if (i == k+1) {
+	if (!no_prio && (i == k+1)) {
 		task->priority = MAX_PRIO;
 	}
 
@@ -113,10 +116,10 @@ static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned i)
 }
 
 static starpu_codelet cl21 = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func = dw_core_codelet_update_u21,
 #ifdef USE_CUDA
-	.cublas_func = dw_cublas_codelet_update_u21,
+	.cuda_func = dw_cublas_codelet_update_u21,
 #endif
 	.nbuffers = 2,
 	.model = &model_21
@@ -134,7 +137,7 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 	task->buffers[1].handle = get_sub_data(dataA, 2, k, j); 
 	task->buffers[1].mode = STARPU_RW;
 
-	if (j == k+1) {
+	if (!no_prio && (j == k+1)) {
 		task->priority = MAX_PRIO;
 	}
 
@@ -150,10 +153,10 @@ static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned j)
 }
 
 static starpu_codelet cl22 = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func = dw_core_codelet_update_u22,
 #ifdef USE_CUDA
-	.cublas_func = dw_cublas_codelet_update_u22,
+	.cuda_func = dw_cublas_codelet_update_u22,
 #endif
 	.nbuffers = 3,
 	.model = &model_22
@@ -175,7 +178,7 @@ static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, uns
 	task->buffers[2].handle = get_sub_data(dataA, 2, i, j); 
 	task->buffers[2].mode = STARPU_RW;
 
-	if ( (i == k + 1) && (j == k +1) ) {
+	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
 		task->priority = MAX_PRIO;
 	}
 
@@ -256,17 +259,19 @@ static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
 	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
 }
 
-void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned _no_prio)
 {
 
 #ifdef CHECK_RESULTS
 	fprintf(stderr, "Checking results ...\n");
 	float *Asaved;
-	Asaved = malloc(ld*ld*sizeof(float));
+	Asaved = malloc((size_t)ld*ld*sizeof(float));
 
-	memcpy(Asaved, matA, ld*ld*sizeof(float));
+	memcpy(Asaved, matA, (size_t)ld*ld*sizeof(float));
 #endif
 
+	no_prio = _no_prio;
+
 	starpu_data_handle dataA;
 
 	/* monitor and partition the A matrix into blocks :

+ 13 - 78
examples/heat/dw_sparse_cg.c

@@ -20,44 +20,9 @@
 
 #include "dw_sparse_cg.h"
 
-#ifdef USE_CUDA
-/* CUDA spmv codelet */
-static struct starpu_cuda_module_s cuda_module;
-static struct starpu_cuda_function_s cuda_function;
-static starpu_cuda_codelet_t cuda_codelet;
-
-void initialize_cuda(void)
-{
-	char module_path[1024];
-	sprintf(module_path,
-		"%s/examples/cuda/spmv_cuda.cubin", STARPUDIR);
-	char *function_symbol = "spmv_kernel_3";
-
-	starpu_init_cuda_module(&cuda_module, module_path);
-	starpu_init_cuda_function(&cuda_function, &cuda_module, function_symbol);
-
-	cuda_codelet.func = &cuda_function;
-	cuda_codelet.stack = NULL;
-	cuda_codelet.stack_size = 0; 
-
-	cuda_codelet.gridx = grids;
-	cuda_codelet.gridy = 1;
-
-	cuda_codelet.blockx = blocks;
-	cuda_codelet.blocky = 1;
-
-	cuda_codelet.shmemsize = 128;
-}
-
-
-
-
-#endif // USE_CUDA
-
 static struct starpu_task *create_task(starpu_tag_t id)
 {
 	starpu_codelet *cl = malloc(sizeof(starpu_codelet));
-		cl->where = ANY;
 		cl->model = NULL;
 
 	struct starpu_task *task = starpu_task_create();
@@ -194,9 +159,9 @@ void init_cg(struct cg_problem *problem)
 
 	/* delta_new = trans(r) r */
 	struct starpu_task *task3 = create_task(3UL);
-	task3->cl->where = CUBLAS|CORE;
+	task3->cl->where = CUDA|CORE;
 #ifdef USE_CUDA
-	task3->cl->cublas_func = cublas_codelet_func_3;
+	task3->cl->cuda_func = cublas_codelet_func_3;
 #endif
 	task3->cl->core_func = core_codelet_func_3;
 	task3->cl_arg = problem;
@@ -241,9 +206,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* alpha = delta_new / ( trans(d) q )*/
 	struct starpu_task *task5 = create_task(maskiter | 5UL);
-	task5->cl->where = CUBLAS|CORE;
+	task5->cl->where = CUDA|CORE;
 #ifdef USE_CUDA
-	task5->cl->cublas_func = cublas_codelet_func_5;
+	task5->cl->cuda_func = cublas_codelet_func_5;
 #endif
 	task5->cl->core_func = core_codelet_func_5;
 	task5->cl_arg = problem;
@@ -257,9 +222,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* x = x + alpha d */
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
-	task6->cl->where = CUBLAS|CORE;
+	task6->cl->where = CUDA|CORE;
 #ifdef USE_CUDA
-	task6->cl->cublas_func = cublas_codelet_func_6;
+	task6->cl->cuda_func = cublas_codelet_func_6;
 #endif
 	task6->cl->core_func = core_codelet_func_6;
 	task6->cl_arg = problem;
@@ -273,9 +238,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* r = r - alpha q */
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
-	task7->cl->where = CUBLAS|CORE;
+	task7->cl->where = CUDA|CORE;
 #ifdef USE_CUDA
-	task7->cl->cublas_func = cublas_codelet_func_7;
+	task7->cl->cuda_func = cublas_codelet_func_7;
 #endif
 	task7->cl->core_func = core_codelet_func_7;
 	task7->cl_arg = problem;
@@ -289,9 +254,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* update delta_* and compute beta */
 	struct starpu_task *task8 = create_task(maskiter | 8UL);
-	task8->cl->where = CUBLAS|CORE;
+	task8->cl->where = CUDA|CORE;
 #ifdef USE_CUDA
-	task8->cl->cublas_func = cublas_codelet_func_8;
+	task8->cl->cuda_func = cublas_codelet_func_8;
 #endif
 	task8->cl->core_func = core_codelet_func_8;
 	task8->cl_arg = problem;
@@ -303,9 +268,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* d = r + beta d */
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
-	task9->cl->where = CUBLAS|CORE;
+	task9->cl->where = CUDA|CORE;
 #ifdef USE_CUDA
-	task9->cl->cublas_func = cublas_codelet_func_9;
+	task9->cl->cuda_func = cublas_codelet_func_9;
 #endif
 	task9->cl->core_func = core_codelet_func_9;
 	task9->cl_arg = problem;
@@ -430,37 +395,7 @@ void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz
 	/* start the runtime */
 	starpu_init(NULL);
 
-
-#ifdef USE_CUDA
-	initialize_cuda();
-#endif
+	starpu_helper_init_cublas();
 
 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
 }
-
-#if 0
-int main(__attribute__ ((unused)) int argc,
-	__attribute__ ((unused)) char **argv)
-{
-	parse_args(argc, argv);
-
-	timing_init();
-
-	/* start the runtime */
-	starpu_init(NULL);
-
-
-#ifdef USE_CUDA
-	initialize_cuda();
-#endif
-
-	init_problem();
-
-	double timing = timing_delay(&start, &end);
-	fprintf(stderr, "Computation took (in ms)\n");
-	printf("%2.2f\n", timing/1000);
-
-
-	return 0;
-}
-#endif

+ 104 - 99
examples/heat/heat.c

@@ -20,13 +20,13 @@
 static unsigned ntheta = 32+2;
 static unsigned nthick = 32+2;
 static unsigned nblocks = 16;
+static unsigned nbigblocks = 8;
 static unsigned shape = 0;
 static unsigned pinned = 0;
+static unsigned check = 0;
 static unsigned version = 2;
 static unsigned use_cg = 0; /* use a LU decomposition of CG ? */
-
-static int argc_;
-static char **argv_;
+static unsigned no_prio = 0;
 
 extern void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
               		unsigned nrow, uint32_t *colind, uint32_t *rowptr);
@@ -60,6 +60,11 @@ static void parse_args(int argc, char **argv)
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 
+		if (strcmp(argv[i], "-nbigblocks") == 0) {
+		        char *argptr;
+			nbigblocks = strtol(argv[++i], &argptr, 10);
+		}
+
 		if (strcmp(argv[i], "-v1") == 0) {
 			version = 1;
 		}
@@ -72,12 +77,32 @@ static void parse_args(int argc, char **argv)
 			version = 3;
 		}
 
+		if (strcmp(argv[i], "-v4") == 0) {
+			version = 4;
+		}
+
 		if (strcmp(argv[i], "-pin") == 0) {
 			pinned = 1;
 		}
 
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-no-prio") == 0) {
+			no_prio = 1;
+		}
+
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			unsigned size = strtol(argv[++i], &argptr, 10);
+			nthick = 130;
+			ntheta = (size/128) + 2;
+			STARPU_ASSERT((nthick - 2)*(ntheta - 2) == size);
+		}
+
 		if (strcmp(argv[i], "-h") == 0) {
-			printf("usage : %s [-v1|-v2|-v3] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg]\n", argv[0]);
+			printf("usage : %s [-v1|-v2|-v3|-v4] [-pin] [-nthick number] [-ntheta number] [-nbigblocks number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio] [-check]\n", argv[0]);
 		}
 	}
 }
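
A short worked example of the new "-size" option above (not part of the commit): the mesh is pinned to 128 interior points in the thick direction (nthick = 130), so size must be a multiple of 128. With -size 4096, ntheta = 4096/128 + 2 = 34 and (130 - 2) * (34 - 2) = 128 * 32 = 4096, so the assertion holds; with -size 1000, integer division gives ntheta = 9 and 128 * 7 = 896 != 1000, so the STARPU_ASSERT aborts.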
@@ -297,44 +322,51 @@ static void solve_system(unsigned size, unsigned subsize, float *result, int *Re
         /* solve UX = X' */
 	fprintf(stderr, "Solving the problem ...\n");
 
-#ifdef CHECK_RESULTS
-	float *savedB = malloc(subsize*sizeof(float));
-	memcpy(savedB, B, subsize*sizeof(float));
-
-	float *LUB = malloc(subsize*sizeof(float));
-#endif
-
-	/* L */
-	STRSV("L", "N", "N", subsize, A, subsize, B, 1);
-
-	/* U */
-        STRSV("U", "N", "U", subsize, A, subsize, B, 1);
-
-	STARPU_ASSERT(DIM == size);
-
-#ifdef CHECK_RESULTS
-	/* compute the error on (LUB - savedB) which should be 0 */
-
-	/* LUB = B */
-	memcpy(LUB, B, subsize*sizeof(float));
+	float *savedB;
+	float *LUB;
 
+	if (check)
+	{
+		savedB = malloc(subsize*sizeof(float));
+		memcpy(savedB, B, subsize*sizeof(float));
+		LUB = malloc(subsize*sizeof(float));
+	}
 
-	/* LUB = U * LUB */
-	STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
+		/* L */
+		STRSV("L", "N", "N", subsize, A, subsize, B, 1);
 	
-	/* LUB = L * LUB */
-	STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
-
-	/* LUB -= B */
-	SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
-
-	/* check if LUB is close to the 0 vector */
-	int maxind = ISAMAX(subsize, LUB, 1);
-	fprintf(stderr, "max (LUX - B) = %f\n",LUB[maxind - 1]);
+		/* U */
+	        STRSV("U", "N", "U", subsize, A, subsize, B, 1);
+	
+		STARPU_ASSERT(DIM == size);
+	
+	if (check)
+	{
+		/* compute the error on (LUB - savedB) which should be 0 */
+	
+		/* LUB = B */
+		memcpy(LUB, B, subsize*sizeof(float));
+	
+	
+		/* LUB = U * LUB */
+		STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
+		
+		/* LUB = L * LUB */
+		STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
+	
+		/* LUB -= B */
+		SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
+	
+		/* check if LUB is close to the 0 vector */
+		int maxind = ISAMAX(subsize, LUB, 1);
+		fprintf(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
 
-	free(LUB);
-	free(savedB);
-#endif
+		float sum = SASUM(subsize, LUB, 1);
+		fprintf(stderr,"avg. error %e\n", sum/subsize);
+	
+		free(LUB);
+		free(savedB);
+	}
 
 	/* now display back the ACTUAL result */
 	for (i = 0; i < subsize; i++)
@@ -440,7 +472,7 @@ void build_mesh(point *mesh)
 	}
 }
 
-static unsigned build_neighbour_vector(unsigned *neighbours, unsigned node, int *RefArray, int *RefArrayBack)
+static unsigned long build_neighbour_vector(unsigned long*neighbours, unsigned node, int *RefArray, int *RefArrayBack)
 {
 	/* where is that point in the former space ? */
 	int former = TRANSLATE(node);
@@ -526,9 +558,9 @@ static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bform
 	for (j = 0 ; j < newsize ; j++)
 	{
 
-		unsigned neighbour;
-		unsigned nneighbours;
-		unsigned neighbours[9];
+		unsigned long neighbour;
+		unsigned long nneighbours;
+		unsigned long neighbours[9];
 
 		nneighbours = build_neighbour_vector(&neighbours[0], j, RefArray, RefArrayBack);
 
@@ -560,9 +592,9 @@ static unsigned build_sparse_stiffness_matrix_A(point *pmesh, float **nzval, uin
 	{
 		rowptr[j] = pos;
 
-		unsigned neighbour;
-		unsigned nneighbours;
-		unsigned neighbours[9];
+		unsigned long neighbour;
+		unsigned long nneighbours;
+		unsigned long neighbours[9];
 
 		nneighbours = build_neighbour_vector(&neighbours[0], j, RefArray, RefArrayBack);
 
@@ -597,7 +629,7 @@ static unsigned build_sparse_stiffness_matrix_A(point *pmesh, float **nzval, uin
 
 static void build_dense_stiffness_matrix_A(point *pmesh, float *A, unsigned newsize, int *RefArray, int *RefArrayBack)
 {
-	unsigned j;
+	unsigned long j;
 
 	/* touch all the memory */
 	memset(A, 0, newsize*newsize*sizeof(float));
@@ -605,20 +637,20 @@ static void build_dense_stiffness_matrix_A(point *pmesh, float *A, unsigned news
 	/* now the actual stiffness (reordered) matrix*/
 	for (j = 0 ; j < newsize ; j++)
 	{
-		unsigned neighbour;
-		unsigned nneighbours;
-		unsigned neighbours[9];
+		unsigned long neighbour;
+		unsigned long nneighbours;
+		unsigned long neighbours[9];
 
 		nneighbours = build_neighbour_vector(&neighbours[0], j, RefArray, RefArrayBack);
 
 		for (neighbour = 0; neighbour < nneighbours; neighbour++)
 		{
-			unsigned nodeneighbour =  neighbours[neighbour];
+			unsigned long nodeneighbour =  neighbours[neighbour];
 
 			if (nodeneighbour < newsize) {
 				float val;
 				val = compute_A_value(TRANSLATE(j), TRANSLATE(nodeneighbour), pmesh);
-				A[j+ newsize*nodeneighbour] = val;
+				A[j+ (unsigned long)newsize*nodeneighbour] = val;
 			}
 		}
 	}
@@ -635,13 +667,6 @@ int main(int argc, char **argv)
 	point *pmesh;
 	float *Bformer;
 
-	argc_ = argc;
-	argv_ = argv;
-
-#ifdef USE_MARCEL
-	marcel_init(&argc, argv);
-#endif
-
 	parse_args(argc, argv);
 
 	pmesh = malloc(DIM*sizeof(point));
@@ -652,10 +677,6 @@ int main(int argc, char **argv)
 
 	build_mesh(pmesh);
 
-#ifdef USE_POSTSCRIPT
-	postscript_gen();
-#endif
-
 	/* now simplify that problem given the boundary conditions 
 	 * to do so, we remove the already known variables from the system
 	 * by pivoting the various know variable, RefArray keep track of that
@@ -678,35 +699,6 @@ int main(int argc, char **argv)
 
 		nnz = build_sparse_stiffness_matrix_A(pmesh, &nzval, &colind, rowptr, newsize, RefArray, RefArrayBack);
 
-#if 0
-		printf("nnz : %d\n", nnz);
-
-		fprintf(stdout, "MUMPS FORMAT BEGIN\n");
-		FILE *fm = fopen("input_mumps", "w+");
-		fprintf(fm, "%d\t:N\n%d\t:NZ\n", newsize, nnz);
-
-		unsigned r;
-		for (r = 0; r < newsize; r++)
-		{
-			int first_ind = rowptr[r];
-			int last_ind = rowptr[r+1];
-
-			int ind;
-			for (ind = first_ind; ind < last_ind; ind++)
-			{
-				 fprintf(fm, "%d %d %f\n", colind[ind]+1, r+1, nzval[ind]);
-			}
-		} 
-
-		for (r = 0; r < newsize; r++)
-		{
-			fprintf(fm, "%f\n", B[r]);
-		}
-	
-		fclose(fm);
-		fprintf(stdout, "MUMPS FORMAT END\n");
-#endif		
-
 		do_conjugate_gradient(nzval, B, result, nnz, newsize, colind, rowptr);
 
 		/* XXX */
@@ -736,18 +728,31 @@ int main(int argc, char **argv)
 
 		build_dense_stiffness_matrix_A(pmesh, A, newsize, RefArray, RefArrayBack);
 
-		fprintf(stderr, "Problem size : %dx%d (%dx%d)\n", newsize, newsize, DIM, DIM);
-
-		if (version < 3) {
-			dw_factoLU(A, newsize, newsize, nblocks, version);
-		}
-		else {
-			dw_factoLU_tag(A, newsize, newsize, nblocks);
+		fprintf(stderr, "Problem size : %dx%d (%dx%d) (%ld MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
+
+		STARPU_ASSERT(newsize % nblocks == 0);
+
+		switch (version) {
+			case 1:
+			case 2:
+				dw_factoLU(A, newsize, newsize, nblocks, version, no_prio);
+				break;
+			case 3:
+				dw_factoLU_tag(A, newsize, newsize, nblocks, no_prio);
+				break;
+			case 4:
+				dw_factoLU_grain(A, newsize, newsize, nblocks, nbigblocks);
+				break;
+			default:
+				STARPU_ASSERT(0);
 		}
 
 		display_stat_heat();
 
-		solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
+		if (check)
+			solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
+
+		starpu_helper_shutdown_cublas();
 
 		starpu_shutdown();
 	}

+ 4 - 7
examples/heat/heat.h

@@ -27,8 +27,7 @@
 #include <starpu_config.h>
 #include <starpu.h>
 
-#include "../common/blas.h"
-
+#include <common/blas.h>
 
 #ifdef OPENGL_RENDER
 #include <GL/gl.h>
@@ -46,19 +45,17 @@
 
 #define Pi	(3.141592f)
 
-#define NODE_NUMBER(theta, thick)	((thick)+(theta)*nthick)
+#define NODE_NUMBER(theta, thick)	((unsigned long)((thick)+(theta)*nthick))
 #define NODE_TO_THICK(n)		((n) % nthick)
 #define NODE_TO_THETA(n)		((n) / nthick)
 
-//#define USE_POSTSCRIPT	1
-
 typedef struct point_t {
 	float x;
 	float y;
 } point;
 
-extern void dw_factoLU(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned version);
-extern void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks);
+extern void dw_factoLU(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned version, unsigned no_prio);
+extern void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned no_prio);
 extern void initialize_system(float **A, float **B, unsigned dim, unsigned pinned);
 
 void display_stat_heat(void);

+ 0 - 45
examples/heat/heat_display.c

@@ -236,48 +236,3 @@ void opengl_render(unsigned _ntheta, unsigned _nthick, float *_result, point *_p
 	glutMainLoop();
 }
 #endif // OPENGL_RENDER
-
-#ifdef USE_POSTSCRIPT
-static void postscript_gen(void)
-{
-	FILE *psfile;
-	psfile = fopen("output.ps", "w+");
-
-	int offx, offy;
-	unsigned theta, thick;
-
-	offx = RMAX+50;
-	offy = 100;
-
-	for (theta = 0; theta < ntheta-1; theta++)
-	{
-		for (thick = 0; thick < nthick-1; thick++)
-		{
-			fprintf(psfile, "newpath\n");
-			fprintf(psfile, "%d %d moveto\n", (int)pmesh[NODE_NUMBER(theta, thick)].x + offx,
-					(int)pmesh[NODE_NUMBER(theta, thick)].y+ offy);
-			fprintf(psfile, "%d %d lineto\n", (int)pmesh[NODE_NUMBER(theta+1, thick)].x + offx,
-					(int)pmesh[NODE_NUMBER(theta+1, thick)].y+ offy);
-			fprintf(psfile, "%d %d lineto\n", (int)pmesh[NODE_NUMBER(theta+1, thick+1)].x + offx,
-					(int)pmesh[NODE_NUMBER(theta+1, thick+1)].y+ offy);
-			fprintf(psfile, "closepath\n");
-			fprintf(psfile, "stroke\n");
-
-			fprintf(psfile, "newpath\n");
-			fprintf(psfile, "%d %d moveto\n", (int)pmesh[NODE_NUMBER(theta, thick)].x + offx,
-					(int)pmesh[NODE_NUMBER(theta, thick)].y+ offy);
-			fprintf(psfile, "%d %d lineto\n", (int)pmesh[NODE_NUMBER(theta, thick+1)].x + offx,
-					(int)pmesh[NODE_NUMBER(theta, thick+1)].y+ offy);
-			fprintf(psfile, "%d %d lineto\n", (int)pmesh[NODE_NUMBER(theta+1, thick+1)].x + offx,
-					(int)pmesh[NODE_NUMBER(theta+1, thick+1)].y+ offy);
-			fprintf(psfile, "closepath\n");
-
-			fprintf(psfile, "stroke\n");
-		}
-	}
-
-	fclose(psfile);
-}
-#endif
-
-

+ 21 - 102
examples/incrementer/incrementer.c

@@ -14,44 +14,17 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <string.h>
-#include <sys/types.h>
+#include <starpu.h>
 #include <pthread.h>
 
-/* for USE_CUDA */
-#include <starpu_config.h>
-#include <starpu.h>
+#define NITER	50000
 
 #ifdef USE_CUDA
-#include <cuda.h>
-#include <cublas.h>
+extern void cuda_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) void *_args);
 #endif
 
-#define NITER	50000
-
-static starpu_data_handle my_float_state;
-static starpu_data_handle unity_state;
-
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
-
-unsigned size __attribute__ ((aligned (16))) = 4*sizeof(float);
-
-float my_lovely_float[4] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f}; 
-float unity[4] __attribute__ ((aligned (16))) = { 1.0f, 0.0f, 1.0f};
-
-void callback_func(void *argcb)
-{
-	unsigned cnt = STARPU_ATOMIC_ADD((unsigned *)argcb, 1);
-
-	if (cnt == NITER) 
-	{
-		pthread_mutex_lock(&mutex);
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
 
-	}
-}
+extern void cuda_codelet_host(float *tab);
 
 void core_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) void *_args)
 {
@@ -60,91 +33,38 @@ void core_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) voi
 	val[0] += 1.0f; val[1] += 1.0f;
 }
 
-#ifdef USE_CUDA
-void cublas_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) void *_args)
-{
-	float *val = (float *)buffers[0].vector.ptr;
-	float *dunity = (float *)buffers[1].vector.ptr;
-
-	cublasSaxpy(3, 1.0f, dunity, 1, val, 1);
-}
-#endif
-
-#ifdef USE_CUDA
-static struct starpu_cuda_module_s cuda_module;
-static struct starpu_cuda_function_s cuda_function;
-
-static starpu_cuda_codelet_t cuda_codelet;
-
-void initialize_cuda(void)
-{
-	char module_path[1024];
-	sprintf(module_path, 
-		"%s/examples/cuda/incrementer_cuda.cubin", STARPUDIR);
-	char *function_symbol = "cuda_incrementer";
-
-	starpu_init_cuda_module(&cuda_module, module_path);
-	starpu_init_cuda_function(&cuda_function, &cuda_module, function_symbol);
-
-	cuda_codelet.func = &cuda_function;
-
-	cuda_codelet.gridx = 1;
-	cuda_codelet.gridy = 1;
-
-	cuda_codelet.blockx = 1;
-	cuda_codelet.blocky = 1;
-
-	cuda_codelet.shmemsize = 1024;
-}
-#endif
-
-void init_data(void)
-{
-	starpu_register_vector_data(&my_float_state, 0 /* home node */,
-			(uintptr_t)&my_lovely_float, 4, sizeof(float));
-
-	starpu_register_vector_data(&unity_state, 0 /* home node */,
-			(uintptr_t)&unity, 4, sizeof(float));
-}
-
 int main(int argc, char **argv)
 {
-	unsigned counter = 0;
-
 	starpu_init(NULL);
 
-	init_data();
+	float float_array[3] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f}; 
 
-#ifdef USE_CUDA
-	initialize_cuda();
-#endif
+	starpu_data_handle float_array_handle;
+	starpu_register_vector_data(&float_array_handle, 0 /* home node */,
+			(uintptr_t)&float_array, 3, sizeof(float));
 
 	starpu_codelet cl =
 	{
-		.core_func = core_codelet,
+		/* this codelet may be executed by a CPU core or by a CUDA device */
 		.where = CORE|CUDA,
+		.core_func = core_codelet,
 #ifdef USE_CUDA
-		.cuda_func = &cuda_codelet,
+		.cuda_func = cuda_codelet,
 #endif
-		.nbuffers = 2
+		.nbuffers = 1
 	};
 
 	unsigned i;
 	for (i = 0; i < NITER; i++)
 	{
 		struct starpu_task *task = starpu_task_create();
+
 		task->cl = &cl;
 		
-		task->callback_func = callback_func;
-		task->callback_arg = &counter;
-
-		task->cl_arg = &size;
-		task->cl_arg_size = sizeof(unsigned);
+		task->callback_func = NULL;
 
-		task->buffers[0].handle = my_float_state;
+		task->buffers[0].handle = float_array_handle;
 		task->buffers[0].mode = STARPU_RW;
-		task->buffers[1].handle = unity_state; 
-		task->buffers[1].mode = STARPU_R;
 
 		int ret = starpu_submit_task(task);
 		if (STARPU_UNLIKELY(ret == -ENODEV))
@@ -154,16 +74,15 @@ int main(int argc, char **argv)
 		}
 	}
 
-	pthread_mutex_lock(&mutex);
-	pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+	starpu_wait_all_tasks();
 
-	starpu_sync_data_with_mem(my_float_state);
+	/* update the array in RAM */
+	starpu_sync_data_with_mem(float_array_handle);
 	
-	fprintf(stderr, "array -> %f, %f, %f\n", my_lovely_float[0], 
-			my_lovely_float[1], my_lovely_float[2]);
+	fprintf(stderr, "array -> %f, %f, %f\n", float_array[0], 
+			float_array[1], float_array[2]);
 	
-	if (my_lovely_float[0] != my_lovely_float[1] + my_lovely_float[2])
+	if (float_array[0] != float_array[1] + float_array[2])
 		return 1;
 	
 	starpu_shutdown();

+ 32 - 0
examples/incrementer/incrementer_kernels.cu

@@ -0,0 +1,32 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
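+/* The CPU codelet increments elements 0 and 1 while this CUDA kernel
+ * increments elements 0 and 2, so once all tasks have completed the
+ * test tab[0] == tab[1] + tab[2] holds whatever the CPU/GPU split was. */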
+static __global__ void cuda_incrementer(float * tab)
+{
+	tab[0] = tab[0] + 1.0;
+	tab[2] = tab[2] + 1.0;
+	
+	return;
+}
+
+extern "C" void cuda_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) void *_args)
+{
+	float *val = (float *)buffers[0].vector.ptr;
+
+	cuda_incrementer<<<1,1>>>(val);
+}

+ 0 - 114
examples/incrementer/incrementer_runtime.c

@@ -1,114 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-#include <pthread.h>
-
-#define NITER	50000
-
-extern void cuda_codelet_host(float *tab);
-
-static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
-
-void callback_func(void *argcb)
-{
-	unsigned cnt = STARPU_ATOMIC_ADD((unsigned *)argcb, 1);
-	if (cnt == NITER) 
-	{
-		pthread_mutex_lock(&mutex);
-		pthread_cond_signal(&cond);
-		pthread_mutex_unlock(&mutex);
-	}
-}
-
-void core_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) void *_args)
-{
-	float *val = (float *)buffers[0].vector.ptr;
-
-	val[0] += 1.0f; val[1] += 1.0f;
-}
-
-#ifdef USE_CUDA
-void cuda_codelet(starpu_data_interface_t *buffers, __attribute__ ((unused)) void *_args)
-{
-	float *val = (float *)buffers[0].vector.ptr;
-
-	cuda_codelet_host(val);
-}
-#endif
-
-int main(int argc, char **argv)
-{
-	unsigned counter = 0;
-
-	starpu_init(NULL);
-
-	float float_array[3] __attribute__ ((aligned (16))) = { 0.0f, 0.0f, 0.0f}; 
-
-	starpu_data_handle float_array_handle;
-	starpu_register_vector_data(&float_array_handle, 0 /* home node */,
-			(uintptr_t)&float_array, 3, sizeof(float));
-
-	starpu_codelet cl =
-	{
-		/* CUBLAS stands for CUDA kernels controlled from the host */
-		.where = CORE|CUBLAS,
-		.core_func = core_codelet,
-#ifdef USE_CUDA
-		.cublas_func = cuda_codelet,
-#endif
-		.nbuffers = 1
-	};
-
-	unsigned i;
-	for (i = 0; i < NITER; i++)
-	{
-		struct starpu_task *task = starpu_task_create();
-
-		task->cl = &cl;
-		
-		task->callback_func = callback_func;
-		task->callback_arg = &counter;
-
-		task->buffers[0].handle = float_array_handle;
-		task->buffers[0].mode = STARPU_RW;
-
-		int ret = starpu_submit_task(task);
-		if (STARPU_UNLIKELY(ret == -ENODEV))
-		{
-			fprintf(stderr, "No worker may execute this task\n");
-			exit(0);
-		}
-	}
-
-	pthread_mutex_lock(&mutex);
-	pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
-
-	/* update the array in RAM */
-	starpu_sync_data_with_mem(float_array_handle);
-	
-	fprintf(stderr, "array -> %f, %f, %f\n", float_array[0], 
-			float_array[1], float_array[2]);
-	
-	if (float_array[0] != float_array[1] + float_array[2])
-		return 1;
-	
-	starpu_shutdown();
-
-	return 0;
-}

+ 18 - 0
examples/lu/dlu.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "xlu.c"

+ 18 - 0
examples/lu/dlu_kernels.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "xlu_kernels.c"

+ 2 - 12
examples/incrementer/incrementer_runtime_kernels.cu

@@ -14,15 +14,5 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-extern "C" __global__ void cuda_incrementer(float * tab)
-{
-	tab[0] = tab[0] + 1.0;
-	tab[2] = tab[2] + 1.0;
-	
-	return;
-}
-
-extern "C" void cuda_codelet_host(float *tab)
-{
-	cuda_incrementer<<<1,1>>>(tab);
-}
+#include "double.h"
+#include "xlu_pivot.c"

+ 39 - 0
examples/lu/double.h

@@ -0,0 +1,39 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
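+/* Double-precision instantiation: the generic xlu*.c and lu_example.c
+ * sources are compiled once per TYPE, with the kernels mapped onto the
+ * corresponding BLAS and CUBLAS routines below. */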
+#define TYPE double
+
+#define STARPU_LU(name)       starpu_dlu_##name
+
+#define CUBLAS_GEMM	cublasDgemm
+#define CUBLAS_TRSM	cublasDtrsm
+#define CUBLAS_SCAL	cublasDscal
+#define CUBLAS_GER	cublasDger
+#define CUBLAS_SWAP	cublasDswap
+#define CUBLAS_IAMAX	cublasIdamax
+
+#define CPU_GEMM	DGEMM
+#define CPU_TRSM	DTRSM
+#define CPU_SCAL	DSCAL
+#define CPU_GER		DGER
+#define CPU_SWAP	DSWAP
+
+#define CPU_TRMM	DTRMM
+#define CPU_AXPY	DAXPY
+#define CPU_ASUM	DASUM
+#define CPU_IAMAX	IDAMAX
+
+#define PIVOT_THRESHHOLD	10e-10

+ 39 - 0
examples/lu/float.h

@@ -0,0 +1,39 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define TYPE float
+
+#define STARPU_LU(name)       starpu_slu_##name
+
+#define CUBLAS_GEMM	cublasSgemm
+#define CUBLAS_TRSM	cublasStrsm
+#define CUBLAS_SCAL	cublasSscal
+#define CUBLAS_GER	cublasSger
+#define CUBLAS_SWAP	cublasSswap
+#define CUBLAS_IAMAX	cublasIsamax
+
+#define CPU_GEMM	SGEMM
+#define CPU_TRSM	STRSM
+#define CPU_SCAL	SSCAL
+#define CPU_GER		SGER
+#define CPU_SWAP	SSWAP
+
+#define CPU_TRMM	STRMM
+#define CPU_AXPY	SAXPY
+#define CPU_ASUM	SASUM
+#define CPU_IAMAX	ISAMAX
+
+#define PIVOT_THRESHHOLD	10e-5

+ 311 - 0
examples/lu/lu_example.c

@@ -0,0 +1,311 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+
+#include "xlu.h"
+#include "xlu_kernels.h"
+
+static unsigned long size = 16384;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static unsigned pivot = 0;
+static unsigned no_stride = 0;
+
+TYPE *A, *A_saved;
+
+/* in case we use non-strided blocks */
+TYPE **A_blocks;
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-piv") == 0) {
+			pivot = 1;
+		}
+
+		if (strcmp(argv[i], "-no-stride") == 0) {
+			no_stride = 1;
+		}
+
+	}
+}
+
+static void display_matrix(TYPE *m, unsigned n, unsigned ld, char *str)
+{
+#if 0
+	fprintf(stderr, "***********\n");
+	fprintf(stderr, "Display matrix %s\n", str);
+	unsigned i,j;
+	for (j = 0; j < n; j++)
+	{
+		for (i = 0; i < n; i++)
+		{
+			fprintf(stderr, "%2.2f\t", m[i+j*ld]);
+		}
+		fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "***********\n");
+#endif
+}
+
+void copy_blocks_into_matrix(void)
+{
+	unsigned blocksize = (size/nblocks);
+
+	unsigned i, j;
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		for (j = 0; j < blocksize; j++)
+		for (i = 0; i < blocksize; i++)
+		{
+			A[(i+bi*blocksize) + (j + bj*blocksize)*size] =
+				A_blocks[bi+nblocks*bj][i + j * blocksize];
+		}
+
+		//free(A_blocks[bi+nblocks*bj]);
+	}
+}
+
+
+
+void copy_matrix_into_blocks(void)
+{
+	unsigned blocksize = (size/nblocks);
+
+	unsigned i, j;
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		starpu_malloc_pinned_if_possible((void **)&A_blocks[bi+nblocks*bj], (size_t)blocksize*blocksize*sizeof(TYPE));
+
+		for (j = 0; j < blocksize; j++)
+		for (i = 0; i < blocksize; i++)
+		{
+			A_blocks[bi+nblocks*bj][i + j * blocksize] =
+			A[(i+bi*blocksize) + (j + bj*blocksize)*size];
+		}
+	}
+}
+
+static void init_matrix(void)
+{
+	/* allocate matrix */
+	starpu_malloc_pinned_if_possible((void **)&A, (size_t)size*size*sizeof(TYPE));
+	STARPU_ASSERT(A);
+
+	srand48((long int)time(NULL));
+	//srand48(0);
+
+	/* initialize matrix content */
+	unsigned long i,j;
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < size; i++)
+		{
+			A[i + j*size] = (TYPE)drand48();
+		}
+	}
+
+}
+
+static void save_matrix(void)
+{
+	A_saved = malloc((size_t)size*size*sizeof(TYPE));
+	STARPU_ASSERT(A_saved);
+
+	memcpy(A_saved, A, (size_t)size*size*sizeof(TYPE));
+}
+
+static double frobenius_norm(TYPE *v, unsigned n)
+{
+	double sum2 = 0.0;
+
+	/* compute sqrt(Sum(|x|^2)) */
+
+	unsigned i,j;
+	for (j = 0; j < n; j++)
+	for (i = 0; i < n; i++)
+	{
+		double a = fabsl((double)v[i+n*j]);
+		sum2 += a*a;
+	}
+
+	return sqrt(sum2);
+}
+
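+/* Apply the permutation recorded in ipiv to the saved copy of A, so that
+ * check_result() compares P*A against the computed product L*U. */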
+static void pivot_saved_matrix(unsigned *ipiv)
+{
+	unsigned k;
+	for (k = 0; k < size; k++)
+	{
+		if (k != ipiv[k])
+		{
+	//		fprintf(stderr, "SWAP %d and %d\n", k, ipiv[k]);
+			CPU_SWAP(size, &A_saved[k*size], 1, &A_saved[ipiv[k]*size], 1);
+		}
+	}
+}
+
+static void check_result(void)
+{
+	unsigned i,j;
+	TYPE *L, *U;
+
+	L = malloc((size_t)size*size*sizeof(TYPE));
+	U = malloc((size_t)size*size*sizeof(TYPE));
+
+	memset(L, 0, size*size*sizeof(TYPE));
+	memset(U, 0, size*size*sizeof(TYPE));
+
+	/* only keep the lower part */
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < j; i++)
+		{
+			L[j+i*size] = A[j+i*size];
+		}
+
+		/* diag i = j */
+		L[j+j*size] = A[j+j*size];
+		U[j+j*size] = 1.0;
+
+		for (i = j+1; i < size; i++)
+		{
+			U[j+i*size] = A[j+i*size];
+		}
+	}
+
+	display_matrix(L, size, size, "L");
+	display_matrix(U, size, size, "U");
+
+	/* now A_err = L, compute L*U */
+	CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+
+	display_matrix(A_saved, size, size, "P A_saved");
+	display_matrix(L, size, size, "LU");
+
+	/* compute "LU - A" in L*/
+	CPU_AXPY(size*size, -1.0, A_saved, 1, L, 1);
+	display_matrix(L, size, size, "Residuals");
+	
+	TYPE err = CPU_ASUM(size*size, L, 1);
+	int max = CPU_IAMAX(size*size, L, 1);
+
+	fprintf(stderr, "Avg error : %e\n", err/(size*size));
+	fprintf(stderr, "Max error : %e\n", L[max]);
+
+	double residual = frobenius_norm(L, size);
+	double matnorm = frobenius_norm(A_saved, size);
+
+	fprintf(stderr, "||%sA-LU|| / (||A||*N) : %e\n", pivot?"P":"", residual/(matnorm*size));
+
+	if (residual/(matnorm*size) > 1e-5)
+		exit(-1);
+}
+
+int main(int argc, char **argv)
+{
+	parse_args(argc, argv);
+
+	starpu_init(NULL);
+
+	starpu_helper_init_cublas();
+
+	init_matrix();
+
+	unsigned *ipiv;
+	if (check)
+		save_matrix();
+
+	display_matrix(A, size, size, "A");
+
+	/* Factorize the matrix (in place) */
+	if (pivot)
+	{
+		ipiv = malloc(size*sizeof(unsigned));
+		if (no_stride)
+		{
+			/* in case the LU decomposition uses non-strided blocks, we _copy_ the matrix into smaller blocks */
+			A_blocks = malloc(nblocks*nblocks*sizeof(TYPE *));
+			copy_matrix_into_blocks();
+
+			STARPU_LU(lu_decomposition_pivot_no_stride)(A_blocks, ipiv, size, size, nblocks);
+
+			copy_blocks_into_matrix();
+			free(A_blocks);
+		}
+		else 
+		{
+			struct timeval start;
+			struct timeval end;
+
+			gettimeofday(&start, NULL);
+
+			STARPU_LU(lu_decomposition_pivot)(A, ipiv, size, size, nblocks);
+	
+			gettimeofday(&end, NULL);
+
+			double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+			
+			unsigned n = size;
+			double flop = (2.0f*n*n*n)/3.0f;
+			fprintf(stderr, "Synthetic GFlops (TOTAL) : \n");
+			fprintf(stdout, "%u	%6.2f\n", n, (flop/timing/1000.0f));
+		}
+	}
+	else
+	{
+		STARPU_LU(lu_decomposition)(A, size, size, nblocks);
+	}
+
+	if (check)
+	{
+		if (pivot)
+			pivot_saved_matrix(ipiv);
+
+		check_result();
+	}
+
+	starpu_helper_shutdown_cublas();
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 18 - 0
examples/lu/lu_example_double.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "lu_example.c"

+ 18 - 0
examples/lu/lu_example_float.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "lu_example.c"

+ 18 - 0
examples/lu/slu.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "xlu.c"

+ 18 - 0
examples/lu/slu_kernels.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "xlu_kernels.c"

+ 18 - 0
examples/lu/slu_pivot.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "xlu_pivot.c"

+ 332 - 0
examples/lu/xlu.c

@@ -0,0 +1,332 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "xlu.h"
+#include "xlu_kernels.h"
+
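+/* Each task is identified by a unique 64-bit tag: the upper bits encode
+ * the task type (11, 12, 21 or 22) and the lower bits pack the block
+ * indices k, i and j. */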
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
+#define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+
+static unsigned no_prio = 0;
+
+
+
+
+/*
+ *	Construct the DAG
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+static struct starpu_perfmodel_t STARPU_LU(model_11) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_11_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_11_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_11)
+#endif
+};
+
+static starpu_codelet cl11 = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_u11),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u11),
+#endif
+	.nbuffers = 1,
+	.model = &STARPU_LU(model_11)
+};
+
+static struct starpu_task *create_task_11(starpu_data_handle dataA, unsigned k)
+{
+//	printf("task 11 k = %d TAG = %llx\n", k, (TAG11(k)));
+
+	struct starpu_task *task = create_task(TAG11(k));
+
+	task->cl = &cl11;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	if (!no_prio)
+		task->priority = MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return task;
+}
+
+static struct starpu_perfmodel_t STARPU_LU(model_12) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_12_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_12_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_12)
+#endif
+};
+
+static starpu_codelet cl12 = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_u12),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u12),
+#endif
+	.nbuffers = 2,
+	.model = &STARPU_LU(model_12)
+};
+
+static void create_task_12(starpu_data_handle dataA, unsigned k, unsigned j)
+{
+//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
+
+	struct starpu_task *task = create_task(TAG12(k, j));
+	
+	task->cl = &cl12;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_sub_data(dataA, 2, j, k); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (!no_prio && (j == k+1)) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG12(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG12(k, j), 1, TAG11(k));
+	}
+
+	starpu_submit_task(task);
+}
+
+static struct starpu_perfmodel_t STARPU_LU(model_21) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_21_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_21_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_21)
+#endif
+};
+
+static starpu_codelet cl21 = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_u21),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u21),
+#endif
+	.nbuffers = 2,
+	.model = &STARPU_LU(model_21)
+};
+
+static void create_task_21(starpu_data_handle dataA, unsigned k, unsigned i)
+{
+	struct starpu_task *task = create_task(TAG21(k, i));
+
+	task->cl = &cl21;
+	
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_sub_data(dataA, 2, k, i); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (!no_prio && (i == k+1)) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, i), 2, TAG11(k), TAG22(k-1, i, k));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, i), 1, TAG11(k));
+	}
+
+	starpu_submit_task(task);
+}
+
+static struct starpu_perfmodel_t STARPU_LU(model_22) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_22_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_22_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_22)
+#endif
+};
+
+static starpu_codelet cl22 = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_u22),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u22),
+#endif
+	.nbuffers = 3,
+	.model = &STARPU_LU(model_22)
+};
+
+static void create_task_22(starpu_data_handle dataA, unsigned k, unsigned i, unsigned j)
+{
+//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &cl22;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_sub_data(dataA, 2, k, i); /* produced by TAG21(k, i) */ 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_sub_data(dataA, 2, j, k); /* produced by TAG12(k, j) */
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = get_sub_data(dataA, 2, j, i); /* produced by TAG22(k-1, i, j) */
+	task->buffers[2].mode = STARPU_RW;
+
+	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG12(k, j), TAG21(k, i));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG12(k, j), TAG21(k, i));
+	}
+
+	starpu_submit_task(task);
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
+static void dw_codelet_facto_v3(starpu_data_handle dataA, unsigned nblocks)
+{
+	struct timeval start;
+	struct timeval end;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
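+	/* For each k: factorize the diagonal block (task 11), solve the
+	 * blocks of row and column k (tasks 12 and 21), then update the
+	 * trailing submatrix (tasks 22). */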
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11(dataA, k);
+
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_submit_task(task);
+		}
+		
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(dataA, k, i);
+			create_task_21(dataA, k, i);
+		}
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(dataA, k, i, j);
+			}
+		}
+	}
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+	int ret = starpu_submit_task(entry_task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		fprintf(stderr, "No worker may execute this task\n");
+		exit(-1);
+	}
+
+
+
+	/* stall the application until the end of computations */
+	starpu_tag_wait(TAG11(nblocks-1));
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	fprintf(stderr, "Computation took (in ms)\n");
+	printf("%2.2f\n", timing/1000);
+
+	unsigned n = starpu_get_blas_nx(dataA);
+	double flop = (2.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+}
+
+void STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_register_blas_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
+
+	starpu_filter f;
+		f.filter_func = starpu_vertical_block_filter_func;
+		f.filter_arg = nblocks;
+
+	starpu_filter f2;
+		f2.filter_func = starpu_block_filter_func;
+		f2.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &f, &f2);
+
+	dw_codelet_facto_v3(dataA, nblocks);
+
+	/* gather all the data */
+	starpu_unpartition_data(dataA, 0);
+}

+ 109 - 0
examples/lu/xlu.h

@@ -0,0 +1,109 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __XLU_H__
+#define __XLU_H__
+
+/* for USE_CUDA */
+#include <starpu_config.h>
+#include <starpu.h>
+
+#include <common/blas.h>
+
+#define BLAS3_FLOP(n1,n2,n3)    \
+        (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
+
+#ifdef CHECK_RESULTS
+static void __attribute__ ((unused)) compare_A_LU(float *A, float *LU,
+				unsigned size, unsigned ld)
+{
+	unsigned i,j;
+	float *L;
+	float *U;
+
+	L = malloc(size*size*sizeof(float));
+	U = malloc(size*size*sizeof(float));
+
+	memset(L, 0, size*size*sizeof(float));
+	memset(U, 0, size*size*sizeof(float));
+
+	/* only keep the lower part */
+	for (j = 0; j < size; j++)
+	{
+		for (i = 0; i < j; i++)
+		{
+			L[j+i*size] = LU[j+i*ld];
+		}
+
+		/* diag i = j */
+		L[j+j*size] = LU[j+j*ld];
+		U[j+j*size] = 1.0f;
+
+		for (i = j+1; i < size; i++)
+		{
+			U[j+i*size] = LU[j+i*ld];
+		}
+	}
+
+        /* now A_err = L, compute L*U */
+	STRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+
+	float max_err = 0.0f;
+	for (i = 0; i < size ; i++)
+	{
+		for (j = 0; j < size; j++) 
+		{
+			max_err = STARPU_MAX(max_err, fabs(  L[j+i*size] - A[j+i*ld]  ));
+		}
+	}
+
+	printf("max error between A and L*U = %f \n", max_err);
+}
+#endif // CHECK_RESULTS
+
+void dw_core_codelet_update_u11(starpu_data_interface_t *, void *);
+void dw_core_codelet_update_u12(starpu_data_interface_t *, void *);
+void dw_core_codelet_update_u21(starpu_data_interface_t *, void *);
+void dw_core_codelet_update_u22(starpu_data_interface_t *, void *);
+
+#ifdef USE_CUDA
+void dw_cublas_codelet_update_u11(starpu_data_interface_t *descr, void *_args);
+void dw_cublas_codelet_update_u12(starpu_data_interface_t *descr, void *_args);
+void dw_cublas_codelet_update_u21(starpu_data_interface_t *descr, void *_args);
+void dw_cublas_codelet_update_u22(starpu_data_interface_t *descr, void *_args);
+#endif
+
+void dw_callback_codelet_update_u11(void *);
+void dw_callback_codelet_update_u12_21(void *);
+void dw_callback_codelet_update_u22(void *);
+
+void dw_callback_v2_codelet_update_u11(void *);
+void dw_callback_v2_codelet_update_u12(void *);
+void dw_callback_v2_codelet_update_u21(void *);
+void dw_callback_v2_codelet_update_u22(void *);
+
+extern struct starpu_perfmodel_t model_11;
+extern struct starpu_perfmodel_t model_12;
+extern struct starpu_perfmodel_t model_21;
+extern struct starpu_perfmodel_t model_22;
+
+struct piv_s {
+	unsigned *piv; /* complete pivot array */
+	unsigned first; /* first element */
+	unsigned last; /* last element */
+};
+
+#endif // __XLU_H__

+ 457 - 0
examples/lu/xlu_kernels.c

@@ -0,0 +1,457 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "xlu.h"
+#include <math.h>
+
+/*
+ *   U22 
+ */
+
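+/* The 's' argument selects the implementation: 0 for the CPU (BLAS)
+ * version, 1 for the CUDA (CUBLAS) version. */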
+static inline void STARPU_LU(common_u22)(starpu_data_interface_t *buffers,
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *right 	= (TYPE *)buffers[0].blas.ptr;
+	TYPE *left 	= (TYPE *)buffers[1].blas.ptr;
+	TYPE *center 	= (TYPE *)buffers[2].blas.ptr;
+
+	unsigned dx = buffers[2].blas.nx;
+	unsigned dy = buffers[2].blas.ny;
+	unsigned dz = buffers[0].blas.ny;
+
+	unsigned ld12 = buffers[0].blas.ld;
+	unsigned ld21 = buffers[1].blas.ld;
+	unsigned ld22 = buffers[2].blas.ld;
+
+#ifdef USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	switch (s) {
+		case 0:
+			CPU_GEMM("N", "N", dy, dx, dz, 
+				(TYPE)-1.0, right, ld21, left, ld12,
+				(TYPE)1.0, center, ld22);
+			break;
+
+#ifdef USE_CUDA
+		case 1:
+			CUBLAS_GEMM('n', 'n', dx, dy, dz,
+				(TYPE)-1.0, right, ld21, left, ld12,
+				(TYPE)1.0f, center, ld22);
+
+			status = cublasGetError();
+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
+				STARPU_ASSERT(0);
+
+			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+				CUDA_REPORT_ERROR(cures);
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+void STARPU_LU(cpu_u22)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u22)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void STARPU_LU(cublas_u22)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u22)(descr, 1, _args);
+}
+#endif// USE_CUDA
+
+/*
+ * U12
+ */
+
+static inline void STARPU_LU(common_u12)(starpu_data_interface_t *buffers,
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub12;
+
+	sub11 = (TYPE *)buffers[0].blas.ptr;	
+	sub12 = (TYPE *)buffers[1].blas.ptr;
+
+	unsigned ld11 = buffers[0].blas.ld;
+	unsigned ld12 = buffers[1].blas.ld;
+
+	unsigned nx12 = buffers[1].blas.nx;
+	unsigned ny12 = buffers[1].blas.ny;
+
+#ifdef USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	/* solve L11 U12 = A12 (find U12) */
+	switch (s) {
+		case 0:
+			CPU_TRSM("L", "L", "N", "N", nx12, ny12,
+					(TYPE)1.0, sub11, ld11, sub12, ld12);
+			break;
+#ifdef USE_CUDA
+		case 1:
+			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
+					(TYPE)1.0, sub11, ld11, sub12, ld12);
+
+			status = cublasGetError();
+			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
+				STARPU_ASSERT(0);
+
+			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+				CUDA_REPORT_ERROR(cures);
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+void STARPU_LU(cpu_u12)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u12)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void STARPU_LU(cublas_u12)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u12)(descr, 1, _args);
+}
+#endif // USE_CUDA
+
+/* 
+ * U21
+ */
+
+static inline void STARPU_LU(common_u21)(starpu_data_interface_t *buffers,
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+	TYPE *sub21;
+
+	sub11 = (TYPE *)buffers[0].blas.ptr;
+	sub21 = (TYPE *)buffers[1].blas.ptr;
+
+	unsigned ld11 = buffers[0].blas.ld;
+	unsigned ld21 = buffers[1].blas.ld;
+
+	unsigned nx21 = buffers[1].blas.nx;
+	unsigned ny21 = buffers[1].blas.ny;
+	
+#ifdef USE_CUDA
+	cublasStatus status;
+	cudaError_t cures;
+#endif
+
+	switch (s) {
+		case 0:
+			CPU_TRSM("R", "U", "N", "U", nx21, ny21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+			break;
+#ifdef USE_CUDA
+		case 1:
+			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
+					(TYPE)1.0, sub11, ld11, sub21, ld21);
+
+			status = cublasGetError();
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_ASSERT(0);
+
+			cudaThreadSynchronize();
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+void STARPU_LU(cpu_u21)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u21)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void STARPU_LU(cublas_u21)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u21)(descr, 1, _args);
+}
+#endif 
+
+/*
+ *	U11
+ */
+
+static inline void STARPU_LU(common_u11)(starpu_data_interface_t *descr,
+				int s, __attribute__((unused)) void *_args)
+{
+	TYPE *sub11;
+
+	sub11 = (TYPE *)descr[0].blas.ptr; 
+
+	unsigned long nx = descr[0].blas.nx;
+	unsigned long ld = descr[0].blas.ld;
+
+	unsigned long z;
+
+	switch (s) {
+		case 0:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				pivot = sub11[z+z*ld];
+				STARPU_ASSERT(pivot != 0.0);
+		
+				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
+		
+				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			break;
+#ifdef USE_CUDA
+		case 1:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
+
+				STARPU_ASSERT(pivot != 0.0);
+				
+				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				
+				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+			
+			cudaThreadSynchronize();
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+void STARPU_LU(cpu_u11)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u11)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void STARPU_LU(cublas_u11)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u11)(descr, 1, _args);
+}
+#endif// USE_CUDA
+
+/*
+ *	U11 with pivoting
+ */
+
+static inline void STARPU_LU(common_u11_pivot)(starpu_data_interface_t *descr,
+				int s, void *_args)
+{
+	TYPE *sub11;
+
+	sub11 = (TYPE *)descr[0].blas.ptr; 
+
+	unsigned long nx = descr[0].blas.nx;
+	unsigned long ld = descr[0].blas.ld;
+
+	unsigned long z;
+
+	struct piv_s *piv = _args;
+	unsigned *ipiv = piv->piv;
+	unsigned first = piv->first;
+
+	int i,j;
+
+	switch (s) {
+		case 0:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				pivot = sub11[z+z*ld];
+
+				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
+				{
+
+					/* find the pivot */
+					int piv_ind = CPU_IAMAX(nx - z, &sub11[z*(ld+1)], ld);
+
+					ipiv[z + first] = piv_ind + z + first;
+
+					/* swap if needed */
+					if (piv_ind != 0)
+					{
+						CPU_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
+					}
+
+					pivot = sub11[z+z*ld];
+				}
+			
+				STARPU_ASSERT(pivot != 0.0);
+
+				CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
+		
+				CPU_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+			}
+
+			break;
+#ifdef USE_CUDA
+		case 1:
+			for (z = 0; z < nx; z++)
+			{
+				TYPE pivot;
+				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
+				cudaStreamSynchronize(0);
+
+				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
+				{
+					/* find the pivot */
+					int piv_ind = CUBLAS_IAMAX(nx - z, &sub11[z*(ld+1)], ld) - 1;
+	
+					ipiv[z + first] = piv_ind + z + first;
+
+					/* swap if needed */
+					if (piv_ind != 0)
+					{
+						CUBLAS_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
+					}
+
+					cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
+					cudaStreamSynchronize(0);
+				}
+
+				STARPU_ASSERT(pivot != 0.0);
+				
+				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				
+				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
+						&sub11[(z+1)+z*ld], 1,
+						&sub11[z+(z+1)*ld], ld,
+						&sub11[(z+1) + (z+1)*ld],ld);
+				
+			}
+
+			cudaThreadSynchronize();
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+void STARPU_LU(cpu_u11_pivot)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u11_pivot)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void STARPU_LU(cublas_u11_pivot)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_u11_pivot)(descr, 1, _args);
+}
+#endif// USE_CUDA
+
+/*
+ *	Pivoting
+ */
+
+static inline void STARPU_LU(common_pivot)(starpu_data_interface_t *descr,
+				int s, void *_args)
+{
+	TYPE *matrix;
+
+	matrix = (TYPE *)descr[0].blas.ptr; 
+	unsigned long nx = descr[0].blas.nx;
+	unsigned long ld = descr[0].blas.ld;
+
+	unsigned row, rowaux;
+
+	struct piv_s *piv = _args;
+	unsigned *ipiv = piv->piv;
+	unsigned first = piv->first;
+	unsigned last = piv->last;
+
+	switch (s) {
+		case 0:
+			for (row = 0; row < nx; row++)
+			{
+				unsigned rowpiv = ipiv[row+first] - first;
+				if (rowpiv != row)
+				{
+					CPU_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
+				}
+			}
+			break;
+#ifdef USE_CUDA
+		case 1:
+			for (row = 0; row < nx; row++)
+			{
+				unsigned rowpiv = ipiv[row+first] - first;
+				if (rowpiv != row)
+				{
+					CUBLAS_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
+				}
+			}
+
+			cudaThreadSynchronize();
+
+			break;
+#endif
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+}
+
+void STARPU_LU(cpu_pivot)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_pivot)(descr, 0, _args);
+}
+
+#ifdef USE_CUDA
+void STARPU_LU(cublas_pivot)(starpu_data_interface_t *descr, void *_args)
+{
+	STARPU_LU(common_pivot)(descr, 1, _args);
+}
+#endif// USE_CUDA
+
+

+ 42 - 0
examples/lu/xlu_kernels.h

@@ -0,0 +1,42 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __XLU_KERNELS_H__
+#define __XLU_KERNELS_H__
+
+#include <starpu.h>
+
+#define str(s) #s
+#define xstr(s)        str(s)
+#define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
+
+void STARPU_LU(cpu_pivot)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cpu_u11_pivot)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cpu_u11)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cpu_u12)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cpu_u21)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cpu_u22)(starpu_data_interface_t *descr, void *_args);
+
+#ifdef USE_CUDA
+void STARPU_LU(cublas_pivot)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cublas_u11_pivot)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cublas_u11)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cublas_u12)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cublas_u21)(starpu_data_interface_t *descr, void *_args);
+void STARPU_LU(cublas_u22)(starpu_data_interface_t *descr, void *_args);
+#endif
+
+#endif // __XLU_KERNELS_H__

+ 526 - 0
examples/lu/xlu_pivot.c

@@ -0,0 +1,526 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "xlu.h"
+#include "xlu_kernels.h"
+
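+/* Same tag encoding as in xlu.c, with an extra PIVOT(k,i) class for the
+ * row-swapping tasks. */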
+#define TAG11(k)	((starpu_tag_t)( (1ULL<<60) | (unsigned long long)(k)))
+#define TAG12(k,i)	((starpu_tag_t)(((2ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+#define TAG21(k,j)	((starpu_tag_t)(((3ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(j))))
+#define TAG22(k,i,j)	((starpu_tag_t)(((4ULL<<60) | ((unsigned long long)(k)<<32) 	\
+					| ((unsigned long long)(i)<<16)	\
+					| (unsigned long long)(j))))
+#define PIVOT(k,i)	((starpu_tag_t)(((5ULL<<60) | (((unsigned long long)(k))<<32)	\
+					| (unsigned long long)(i))))
+
+static unsigned no_prio = 0;
+
+
+
+
+/*
+ *	Construct the DAG
+ */
+
+static struct starpu_task *create_task(starpu_tag_t id)
+{
+	struct starpu_task *task = starpu_task_create();
+		task->cl_arg = NULL;
+
+	task->use_tag = 1;
+	task->tag_id = id;
+
+	return task;
+}
+
+
+static struct starpu_perfmodel_t STARPU_LU(model_pivot) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_pivot_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_pivot_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_pivot)
+#endif
+};
+
+static starpu_codelet cl_pivot = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_pivot),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_pivot),
+#endif
+	.nbuffers = 1,
+	.model = &STARPU_LU(model_pivot)
+};
+
+static void create_task_pivot(starpu_data_handle *dataAp, unsigned nblocks,
+					struct piv_s *piv_description,
+					unsigned k, unsigned i,
+					starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct starpu_task *task = create_task(PIVOT(k, i));
+
+	task->cl = &cl_pivot;
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_block(dataAp, nblocks, k, i);
+	task->buffers[0].mode = STARPU_RW;
+
+	task->cl_arg = &piv_description[k];
+
+	/* this is an important task */
+	if (!no_prio && (i == k+1))
+		task->priority = MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k == 0) {
+		starpu_tag_declare_deps(PIVOT(k, i), 1, TAG11(k));
+	}
+	else 
+	{
+		if (i > k) {
+			starpu_tag_declare_deps(PIVOT(k, i), 2, TAG11(k), TAG22(k-1, i, k));
+		}
+		else {
+			starpu_tag_t *tags = malloc((nblocks - k)*sizeof(starpu_tag_t));
+			
+			tags[0] = TAG11(k);
+			unsigned ind, ind2;
+			for (ind = k + 1, ind2 = 0; ind < nblocks; ind++, ind2++)
+			{
+				tags[1 + ind2] = TAG22(k-1, ind, k);
+			}
+
+			/* perhaps we could do better ... :/  */
+			starpu_tag_declare_deps_array(PIVOT(k, i), (nblocks-k), tags);
+		}
+	}
+
+	starpu_submit_task(task);
+}
+
+static struct starpu_perfmodel_t STARPU_LU(model_11_pivot) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_11_pivot_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_11_pivot_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_11_pivot)
+#endif
+};
+
+static starpu_codelet cl11_pivot = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_u11_pivot),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u11_pivot),
+#endif
+	.nbuffers = 1,
+	.model = &STARPU_LU(model_11_pivot)
+};
+
+static struct starpu_task *create_task_11_pivot(starpu_data_handle *dataAp, unsigned nblocks,
+					unsigned k, struct piv_s *piv_description,
+					starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct starpu_task *task = create_task(TAG11(k));
+
+	task->cl = &cl11_pivot;
+
+	task->cl_arg = &piv_description[k];
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_block(dataAp, nblocks, k, k);
+	task->buffers[0].mode = STARPU_RW;
+
+	/* this is an important task */
+	if (!no_prio)
+		task->priority = MAX_PRIO;
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
+	}
+
+	return task;
+}
+
+static struct starpu_perfmodel_t STARPU_LU(model_12) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_12_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_12_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_12)
+#endif
+};
+
+static starpu_codelet cl12 = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_u12),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u12),
+#endif
+	.nbuffers = 2,
+	.model = &STARPU_LU(model_12)
+};
+
+static void create_task_12(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned j,
+		starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+//	printf("task 12 k,i = %d,%d TAG = %llx\n", k,i, TAG12(k,i));
+
+	struct starpu_task *task = create_task(TAG12(k, j));
+	
+	task->cl = &cl12;
+
+	task->cl_arg = (void *)(task->tag_id);
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_block(dataAp, nblocks, k, k);
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_block(dataAp, nblocks, j, k);
+	task->buffers[1].mode = STARPU_RW;
+
+	if (!no_prio && (j == k+1)) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+#if 0
+	starpu_tag_declare_deps(TAG12(k, i), 1, PIVOT(k, i));
+#endif
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG12(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+	}
+	else {
+		starpu_tag_declare_deps(TAG12(k, j), 1, TAG11(k));
+	}
+
+	starpu_submit_task(task);
+}
+
+static struct starpu_perfmodel_t STARPU_LU(model_21) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_21_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_21_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_21)
+#endif
+};
+
+static starpu_codelet cl21 = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_u21),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u21),
+#endif
+	.nbuffers = 2,
+	.model = &STARPU_LU(model_21)
+};
+
+static void create_task_21(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned i,
+				starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct starpu_task *task = create_task(TAG21(k, i));
+
+	task->cl = &cl21;
+	
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_block(dataAp, nblocks, k, k); 
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_block(dataAp, nblocks, k, i); 
+	task->buffers[1].mode = STARPU_RW;
+
+	if (!no_prio && (i == k+1)) {
+		task->priority = MAX_PRIO;
+	}
+
+	task->cl_arg = (void *)(task->tag_id);
+
+	/* enforce dependencies ... */
+	starpu_tag_declare_deps(TAG21(k, i), 1, PIVOT(k, i));
+#if 0
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG21(k, i), 3, TAG11(k), TAG22(k-1, k, i), PIVOT(k, i));
+	}
+	else {
+		starpu_tag_declare_deps(TAG21(k, i), 2, TAG11(k), PIVOT(k, i));
+	}
+#endif
+
+	starpu_submit_task(task);
+}
+
+static struct starpu_perfmodel_t STARPU_LU(model_22) = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_LU_STR(lu_model_22_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_LU_STR(lu_model_22_goto)
+#else
+	.symbol = STARPU_LU_STR(lu_model_22)
+#endif
+};
+
+static starpu_codelet cl22 = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_LU(cpu_u22),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_LU(cublas_u22),
+#endif
+	.nbuffers = 3,
+	.model = &STARPU_LU(model_22)
+};
+
+static void create_task_22(starpu_data_handle *dataAp, unsigned nblocks, unsigned k, unsigned i, unsigned j,
+				starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+//	printf("task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j));
+
+	struct starpu_task *task = create_task(TAG22(k, i, j));
+
+	task->cl = &cl22;
+
+	task->cl_arg = (void *)(task->tag_id);
+
+	/* which sub-data is manipulated ? */
+	task->buffers[0].handle = get_block(dataAp, nblocks, k, i); /* produced by TAG21(k, i) */
+	task->buffers[0].mode = STARPU_R;
+	task->buffers[1].handle = get_block(dataAp, nblocks, j, k); /* produced by TAG12(k, j) */ 
+	task->buffers[1].mode = STARPU_R;
+	task->buffers[2].handle = get_block(dataAp, nblocks, j, i);  /* produced by TAG22(k-1, i, j) */
+	task->buffers[2].mode = STARPU_RW;
+
+	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
+		task->priority = MAX_PRIO;
+	}
+
+	/* enforce dependencies ... */
+	if (k > 0) {
+		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG12(k, j), TAG21(k, i));
+	}
+	else {
+		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG12(k, j), TAG21(k, i));
+	}
+
+	starpu_submit_task(task);
+}
+
+/*
+ *	code to bootstrap the factorization 
+ */
+
+static double dw_codelet_facto_pivot(starpu_data_handle *dataAp,
+					struct piv_s *piv_description,
+					unsigned nblocks,
+					starpu_data_handle (* get_block)(starpu_data_handle *, unsigned, unsigned, unsigned))
+{
+	struct timeval start;
+	struct timeval end;
+
+	struct starpu_task *entry_task = NULL;
+
+	/* create all the DAG nodes */
+	unsigned i,j,k;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		struct starpu_task *task = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
+
+		/* we defer the launch of the first task */
+		if (k == 0) {
+			entry_task = task;
+		}
+		else {
+			starpu_submit_task(task);
+		}
+
+		for (i = 0; i < nblocks; i++)
+		{
+			if (i != k)
+				create_task_pivot(dataAp, nblocks, piv_description, k, i, get_block);
+		}
+	
+		for (i = k+1; i<nblocks; i++)
+		{
+			create_task_12(dataAp, nblocks, k, i, get_block);
+			create_task_21(dataAp, nblocks, k, i, get_block);
+		}
+
+		for (i = k+1; i<nblocks; i++)
+		{
+			for (j = k+1; j<nblocks; j++)
+			{
+				create_task_22(dataAp, nblocks, k, i, j, get_block);
+			}
+		}
+	}
+
+	/* we wait the last task (TAG11(nblocks - 1)) and all the pivot tasks */
+	starpu_tag_t *tags = malloc(nblocks*nblocks*sizeof(starpu_tag_t));
+	unsigned ndeps = 0;
+
+	tags[ndeps++] = TAG11(nblocks - 1);
+
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < j; i++)
+		{
+			tags[ndeps++] = PIVOT(j, i);
+		}
+	}
+
+	/* schedule the codelet */
+	gettimeofday(&start, NULL);
+	int ret = starpu_submit_task(entry_task);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		fprintf(stderr, "No worker may execute this task\n");
+		exit(-1);
+	}
+
+	/* stall the application until the end of computations */
+	starpu_tag_wait_array(ndeps, tags);
+//	starpu_wait_all_tasks();
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	return timing;
+}
+
+starpu_data_handle get_block_with_striding(starpu_data_handle *dataAp,
+			unsigned nblocks __attribute__((unused)), unsigned j, unsigned i)
+{
+	/* we use filters */
+	return get_sub_data(*dataAp, 2, j, i);
+}
+
+
+void STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle dataA;
+
+	/* monitor and partition the A matrix into blocks :
+	 * one block is now determined by 2 unsigned (i,j) */
+	starpu_register_blas_data(&dataA, 0, (uintptr_t)matA, ld, size, size, sizeof(TYPE));
+
+	starpu_filter f;
+		f.filter_func = starpu_vertical_block_filter_func;
+		f.filter_arg = nblocks;
+
+	starpu_filter f2;
+		f2.filter_func = starpu_block_filter_func;
+		f2.filter_arg = nblocks;
+
+	starpu_map_filters(dataA, 2, &f, &f2);
+
+	unsigned i;
+	for (i = 0; i < size; i++)
+		ipiv[i] = i;
+
+	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
+	unsigned block;
+	for (block = 0; block < nblocks; block++)
+	{
+		piv_description[block].piv = ipiv;
+		piv_description[block].first = block * (size / nblocks);
+		piv_description[block].last = (block + 1) * (size / nblocks);
+	}
+
+#if 0
+	unsigned j;
+	for (j = 0; j < nblocks; j++)
+	for (i = 0; i < nblocks; i++)
+	{
+		printf("BLOCK %d %d	%p\n", i, j, &matA[i*(size/nblocks) + j * (size/nblocks)*ld]);
+	}
+#endif
+
+	double timing;
+	timing = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding);
+
+	fprintf(stderr, "Computation took (in ms)\n");
+	fprintf(stderr, "%2.2f\n", timing/1000);
+
+	unsigned n = starpu_get_blas_nx(dataA);
+	double flop = (2.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	/* gather all the data */
+	starpu_unpartition_data(dataA, 0);
+}
+
+
+starpu_data_handle get_block_with_no_striding(starpu_data_handle *dataAp, unsigned nblocks, unsigned j, unsigned i)
+{
+	/* dataAp is an array of data handles */
+	return dataAp[i+j*nblocks];
+}
+
+void STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
+{
+	starpu_data_handle *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle));
+
+	/* monitor and partition the A matrix into blocks:
+	 * one block is now determined by two unsigned indices (i,j) */
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		starpu_register_blas_data(&dataAp[bi+nblocks*bj], 0,
+			(uintptr_t)matA[bi+nblocks*bj], size/nblocks,
+			size/nblocks, size/nblocks, sizeof(TYPE));
+	}
+
+	unsigned i;
+	for (i = 0; i < size; i++)
+		ipiv[i] = i;
+
+	struct piv_s *piv_description = malloc(nblocks*sizeof(struct piv_s));
+	unsigned block;
+	for (block = 0; block < nblocks; block++)
+	{
+		piv_description[block].piv = ipiv;
+		piv_description[block].first = block * (size / nblocks);
+		piv_description[block].last = (block + 1) * (size / nblocks);
+	}
+
+	double timing;
+	timing = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding);
+
+	fprintf(stderr, "Computation took (in ms)\n");
+	fprintf(stderr, "%2.2f\n", timing/1000);
+
+	unsigned n = starpu_get_blas_nx(dataAp[0])*nblocks;
+	double flop = (2.0f*n*n*n)/3.0f;
+	fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		starpu_delete_data(dataAp[bi+nblocks*bj]);
+	}
+}
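
The whole factorization above is driven by tags only: every task carries a tag, dependencies are expressed with starpu_tag_declare_deps(), and the application blocks on an array of tags. A minimal, self-contained sketch of that pattern is given below (hypothetical codelets cl_a and cl_b taking no data buffers; the use_tag/tag_id fields are set explicitly here, which is presumably what the create_task() helper above does):

/* Minimal sketch of the tag pattern used above: task B depends on task A
 * through their tags, and the application waits on B's tag. */
#include <starpu.h>

#define TAG_A	((starpu_tag_t)0x1)
#define TAG_B	((starpu_tag_t)0x2)

static void submit_tag_chain(starpu_codelet *cl_a, starpu_codelet *cl_b)
{
	struct starpu_task *task_a = starpu_task_create();
	task_a->cl = cl_a;
	task_a->use_tag = 1;
	task_a->tag_id = TAG_A;

	struct starpu_task *task_b = starpu_task_create();
	task_b->cl = cl_b;
	task_b->use_tag = 1;
	task_b->tag_id = TAG_B;

	/* B may only start once A has completed */
	starpu_tag_declare_deps(TAG_B, 1, TAG_A);

	starpu_submit_task(task_a);
	starpu_submit_task(task_b);

	/* stall until B (and therefore A) is done */
	starpu_tag_t tags[] = { TAG_B };
	starpu_tag_wait_array(1, tags);
}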

+ 26 - 0
examples/mult/dgemm.c

@@ -0,0 +1,26 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define TYPE	double
+
+#define CUBLAS_GEMM cublasDgemm
+#define CPU_GEMM	DGEMM
+#define CPU_ASUM	DASUM
+#define CPU_IAMAX	IDAMAX
+#define STARPU_GEMM(name)	starpu_dgemm_##name
+
+#include "xgemm_kernels.c"
+#include "xgemm.c" 

+ 29 - 105
examples/mult/dw_mult.c

@@ -23,9 +23,6 @@
 float *A, *B, *C;
 starpu_data_handle A_handle, B_handle, C_handle;
 
-pthread_mutex_t mutex;
-pthread_cond_t cond;
-
 /*
  * That program should compute C = A * B 
  * 
@@ -59,15 +56,7 @@ void terminate(void)
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	uint64_t total_flop = BLAS3_FLOP(ydim, xdim, zdim);
-	uint64_t total_ls = ls_cublas + ls_atlas;
-
-	fprintf(stderr, "Computation took (ms):\n");
-	printf("%2.2f\n", timing/1000);
-	fprintf(stderr, "	GFlop : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", (double)total_flop/1000000000.0f, (double)flop_cublas/1000000000.0f, (double)flop_atlas/1000000000.0f);
-	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
-	fprintf(stderr, "	GB : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", (double)total_ls/1000000000.0f, (double)ls_cublas/1000000000.0f, (double)ls_atlas/1000000000.0f);
-	fprintf(stderr, "	GB/s : %2.2f\n", (double)total_ls / (double)timing/1000);
+	display_stats(timing);
 
 #ifdef CHECK_OUTPUT
 	/* check results */
@@ -86,78 +75,14 @@ void terminate(void)
 		fprintf(stderr, "There were errors ... err = %f\n", err);
 	}
 #endif // CHECK_OUTPUT
-
-	pthread_mutex_lock(&mutex);
-	pthread_cond_signal(&cond);
-	pthread_mutex_unlock(&mutex);
 }
 
 void callback_func(void *arg)
 {
-	/* the argument is a pointer to a counter of the remaining tasks */
-	int *counterptr = arg;
-
-	int counter = STARPU_ATOMIC_ADD(counterptr, -1);
-	if (counter == 0)
-	{
-		/* we are done */	
-		fprintf(stderr, "done ...\n");
-		terminate();
-	}
-
-	return;
-}
-
-
-#define COMMON_CODE			\
-	uint32_t nxC, nyC, nyA;		\
-	uint32_t ldA, ldB, ldC;		\
-					\
-	float *subA;			\
-	float *subB;			\
-	float *subC;			\
-					\
-	subA = (float *)descr[0].blas.ptr;	\
-	subB = (float *)descr[1].blas.ptr;	\
-	subC = (float *)descr[2].blas.ptr;	\
-					\
-	nxC = descr[2].blas.nx;		\
-	nyC = descr[2].blas.ny;		\
-	nyA = descr[0].blas.ny;		\
-					\
-	ldA = descr[0].blas.ld;		\
-	ldB = descr[1].blas.ld;		\
-	ldC = descr[2].blas.ld;
-
-
-
-#ifdef USE_CUDA
-void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
-{
-	COMMON_CODE
-
-	cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
-					     0.0f, subC, ldC);
-	cublasStatus st;
-	st = cublasGetError();
-	if (st != CUBLAS_STATUS_SUCCESS)
-		STARPU_ASSERT(0);
-
-	uint64_t flopcnt = BLAS3_FLOP(nyC, nxC, nyA);
-
-	flop_cublas += flopcnt;
-	ls_cublas += BLAS3_LS(nyC, nxC, nyA);
-}
-#endif
-
-void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
-{
-	COMMON_CODE
-
-	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 0.0f, subC, ldC);
-
-	flop_atlas += BLAS3_FLOP(nxC, nyC, nyA);
-	ls_atlas += BLAS3_LS(nxC, nyC, nyA);
+	/* do some accounting */
+	int id = starpu_get_worker_id();
+	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
+	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
 }
 
 static void init_problem_data(void)
@@ -235,6 +160,8 @@ static void partition_mult_data(void)
 	starpu_register_blas_data(&C_handle, 0, (uintptr_t)C, 
 		ydim, ydim, xdim, sizeof(float));
 
+	starpu_data_set_wb_mask(C_handle, 1<<0);
+
 	conf.k = zdim;
 	conf.m = ydim/nslicesy;
 	conf.n = xdim/nslicesx;
@@ -253,6 +180,22 @@ static void partition_mult_data(void)
 	starpu_map_filters(C_handle, 2, &f, &f2);
 }
 
+static starpu_codelet cl = {
+	.where = CORE|CUDA|GORDON,
+	.core_func = core_mult,
+#ifdef USE_CUDA
+	.cuda_func = cublas_mult,
+#endif
+#ifdef USE_GORDON
+#ifdef SPU_FUNC_SGEMM
+	.gordon_func = SPU_FUNC_SGEMM,
+#else
+#warning SPU_FUNC_SGEMM is not available
+#endif
+#endif
+	.nbuffers = 3
+};
+
 static void launch_codelets(void)
 {
 #ifdef USE_FXT
@@ -261,26 +204,8 @@ static void launch_codelets(void)
 	/* partition the work into slices */
 	unsigned taskx, tasky;
 
-	taskcounter = nslicesx * nslicesy;
-
 	srand(time(NULL));
 
-	starpu_codelet cl = {
-		.where = CORE|CUBLAS|GORDON,
-		.core_func = core_mult,
-#ifdef USE_CUDA
-		.cublas_func = cublas_mult,
-#endif
-#ifdef USE_GORDON
-#ifdef SPU_FUNC_SGEMM
-		.gordon_func = SPU_FUNC_SGEMM,
-#else
-#warning SPU_FUNC_SGEMM is not available
-#endif
-#endif
-		.nbuffers = 3
-	};
-
 	/* should we use a single performance model for all archs and use an
  	 * acceleration factor ? */
 	if (use_common_model) {
@@ -302,7 +227,7 @@ static void launch_codelets(void)
 			task->cl_arg_size = sizeof(struct block_conf);
 
 			task->callback_func = callback_func;
-			task->callback_arg = &taskcounter;
+			task->callback_arg = NULL;
 
 			starpu_tag_t tag = TAG(taskx, tasky); 
 
@@ -330,9 +255,7 @@ int main(__attribute__ ((unused)) int argc,
 
 	/* start the runtime */
 	starpu_init(NULL);
-
-	pthread_mutex_init(&mutex, NULL);
-	pthread_cond_init(&cond, NULL);
+	starpu_helper_init_cublas();
 
 	init_problem_data();
 
@@ -340,10 +263,11 @@ int main(__attribute__ ((unused)) int argc,
 
 	launch_codelets();
 
-	pthread_mutex_lock(&mutex);
-	pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+	starpu_wait_all_tasks();
 
+	terminate();
+	
+	starpu_helper_shutdown_cublas();
 	starpu_shutdown();
 
 	return 0;

+ 36 - 0
examples/mult/dw_mult.h

@@ -63,14 +63,17 @@ unsigned zdim = 64;
 unsigned norandom = 0;
 unsigned pin = 0;
 unsigned use_common_model = 0;
+unsigned check = 0;
 
 /* to compute MFlop/s */
 uint64_t flop_cublas = 0;
 uint64_t flop_atlas = 0;
+uint64_t flop_per_worker[STARPU_NMAXWORKERS] = {0};
 
 /* to compute MB/s (load/store) */
 uint64_t ls_cublas = 0;
 uint64_t ls_atlas = 0;
+uint64_t ls_per_worker[STARPU_NMAXWORKERS] = {0};
 
 
 struct timeval start;
@@ -83,6 +86,30 @@ static struct block_conf conf __attribute__ ((aligned (128)));
 #define BLOCKSIZEY	(ydim / nslicesy)
 #define BLOCKSIZEZ	(zdim / nslicesz)
 
+static void display_stats(double timing)
+{
+	unsigned worker;
+	unsigned nworkers = starpu_get_worker_count();
+
+	fprintf(stderr, "Computation took (ms):\n");
+	printf("%2.2f\n", timing/1000);
+
+	uint64_t flop_total = 0, ls_total = 0;
+	
+	for (worker = 0; worker < nworkers; worker++)
+	{
+		flop_total += flop_per_worker[worker];
+		ls_total += ls_per_worker[worker];
+
+		char name[32];
+		starpu_get_worker_name(worker, name, 32);
+
+		fprintf(stderr, "\t%s -> %2.2f GFlop\t%2.2f GFlop/s\n", name, (double)flop_per_worker[worker]/1000000000.0f, (double)flop_per_worker[worker]/(double)timing/1000);
+	}
+
+	fprintf(stderr, "Total: %2.2f GFlops\t%2.2f GFlop/s\n", (double)flop_total/1000000000.0f, (double)flop_total/(double)timing/1000);
+}
+
 static void parse_args(int argc, char **argv)
 {
 	int i;
@@ -138,6 +165,10 @@ static void parse_args(int argc, char **argv)
 			pin = 1;
 		}
 
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
 		if (strcmp(argv[i], "-common-model") == 0) {
 			use_common_model = 1;
 		}
@@ -162,5 +193,10 @@ static void display_memory_consumption(void)
 		+ ydim*xdim*sizeof(float))/(1024*1024) );
 }
 
+#ifdef USE_CUDA
+void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg);
+#endif
+
+void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg);
 
 #endif // __MULT_H__

+ 0 - 256
examples/mult/dw_mult_no_filters.c

@@ -1,256 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "dw_mult.h"
-
-
-float *A, *B, *C;
-starpu_data_handle A_handle, B_handle, C_handle;
-
-/*
- * That program should compute C = A * B 
- * 
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
-
-              |---------------|
-            z |       B       |
-              |---------------|
-       z              x
-     |----|   |---------------|
-     |    |   |               |
-     |    |   |               |
-     | A  | y |       C       |
-     |    |   |               |
-     |    |   |               |
-     |----|   |---------------|
-
- */
-
-static void terminate(void)
-{
-	starpu_delete_data(C_handle);
-
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-	uint64_t total_flop = niter*BLAS3_FLOP(ydim, xdim, zdim);
-	uint64_t total_ls = niter*(ls_cublas + ls_atlas);
-
-	fprintf(stderr, "Computation took (ms):\n");
-	printf("%2.2f\n", timing/1000);
-	fprintf(stderr, "	GFlop : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", (double)total_flop/1000000000.0f, (double)flop_cublas/1000000000.0f, (double)flop_atlas/1000000000.0f);
-	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
-	fprintf(stderr, "	GB : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", (double)total_ls/1000000000.0f, (double)ls_cublas/1000000000.0f, (double)ls_atlas/1000000000.0f);
-	fprintf(stderr, "	GB/s : %2.2f\n", (double)total_ls / (double)timing/1000);
-
-#ifdef CHECK_OUTPUT
-	/* check results */
-	/* compute C = C - niter * AB */
-
-	SGEMM("N", "N", ydim, xdim, zdim, -1.0f*niter, A, ydim, B, zdim, 1.0f, C, ydim);
-		
-	/* make sure C = 0 */
-	float err;
-	err = SASUM(xdim*ydim, C, 1);	
-	
-	if (err < xdim*ydim*0.001) {
-		fprintf(stderr, "Results are OK\n");
-	}
-	else {
-		fprintf(stderr, "There were errors ... err = %f\n", err);
-	}
-#endif // CHECK_OUTPUT
-}
-
-#define COMMON_CODE			\
-	uint32_t nxC, nyC, nyA;		\
-	uint32_t ldA, ldB, ldC;		\
-					\
-	float *subA;			\
-	float *subB;			\
-	float *subC;			\
-					\
-	subA = (float *)descr[0].blas.ptr;	\
-	subB = (float *)descr[1].blas.ptr;	\
-	subC = (float *)descr[2].blas.ptr;	\
-					\
-	nxC = descr[2].blas.nx;		\
-	nyC = descr[2].blas.ny;		\
-	nyA = descr[0].blas.ny;		\
-					\
-	ldA = descr[0].blas.ld;		\
-	ldB = descr[1].blas.ld;		\
-	ldC = descr[2].blas.ld;
-
-
-
-#ifdef USE_CUDA
-void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
-{
-	COMMON_CODE
-
-	cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
-					     0.0f, subC, ldC);
-	cublasStatus st;
-	st = cublasGetError();
-	if (st != CUBLAS_STATUS_SUCCESS)
-		STARPU_ASSERT(0);
-
-	uint64_t flopcnt = BLAS3_FLOP(nyC, nxC, nyA);
-
-	flop_cublas += flopcnt;
-	ls_cublas += BLAS3_LS(nyC, nxC, nyA);
-}
-#endif
-
-void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
-{
-	COMMON_CODE
-
-	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 0.0f, subC, ldC);
-
-	flop_atlas += BLAS3_FLOP(nxC, nyC, nyA);
-	ls_atlas += BLAS3_LS(nxC, nyC, nyA);
-}
-
-static void init_problem_data(void)
-{
-	unsigned i,j;
-
-#ifdef USE_CUDA
-	if (pin) {
-		starpu_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(float));
-		starpu_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(float));
-		starpu_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(float));
-	} else
-#endif
-	{
-#ifdef HAVE_POSIX_MEMALIGN
-		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(float));
-		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(float));
-		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(float));
-#else
-		A = malloc(zdim*ydim*sizeof(float));
-		B = malloc(xdim*zdim*sizeof(float));
-		C = malloc(xdim*ydim*sizeof(float));
-#endif
-	}
-
-	/* fill the A and B matrices */
-	if (norandom) {
-		for (j=0; j < ydim; j++) {
-			for (i=0; i < zdim; i++) {
-				A[j+i*ydim] = (float)(i);
-			}
-		}
-	
-		for (j=0; j < zdim; j++) {
-			for (i=0; i < xdim; i++) {
-				B[j+i*zdim] = (float)(j);
-			}
-		}
-	} 
-	else {
-#ifdef NORANDOM
-		srand(2009);
-		STARPU_ASSERT(0);
-#endif
-		for (j=0; j < ydim; j++) {
-			for (i=0; i < zdim; i++) {
-				A[j+i*ydim] = (float)(drand48());
-			}
-		}
-	
-		for (j=0; j < zdim; j++) {
-			for (i=0; i < xdim; i++) {
-				B[j+i*zdim] = (float)(drand48());
-			}
-		}
-	}
-
-	for (j=0; j < ydim; j++) {
-		for (i=0; i < xdim; i++) {
-			C[j+i*ydim] = (float)(0);
-		}
-	}
-
-	display_memory_consumption();
-
-	starpu_register_blas_data(&A_handle, 0, (uintptr_t)A, 
-		ydim, ydim, zdim, sizeof(float));
-	starpu_register_blas_data(&B_handle, 0, (uintptr_t)B, 
-		zdim, zdim, xdim, sizeof(float));
-	starpu_register_blas_data(&C_handle, 0, (uintptr_t)C, 
-		ydim, ydim, xdim, sizeof(float));
-
-	gettimeofday(&start, NULL);
-}
-
-static void launch_codelets(void)
-{
-	srand(time(NULL));
-
-	starpu_codelet cl = {
-		.where = CORE|CUBLAS,
-		.core_func = core_mult,
-#ifdef USE_CUDA
-		.cublas_func = cublas_mult,
-#endif
-		.model = &sgemm_model,
-		.nbuffers = 3
-	};
-
-	unsigned iter;
-	for (iter = 0; iter < niter; iter++) 
-	{
-		struct starpu_task *task = starpu_task_create();
-
-		task->cl = &cl;
-
-		task->buffers[0].handle = A_handle;
-		task->buffers[0].mode = STARPU_R;
-		task->buffers[1].handle = B_handle;
-		task->buffers[1].mode = STARPU_R;
-		task->buffers[2].handle = C_handle;
-		task->buffers[2].mode = STARPU_RW;
-
-		task->synchronous = 1;
-		starpu_submit_task(task);
-	}
-}
-
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
-{
-
-	parse_args(argc, argv);
-
-	/* start the runtime */
-	starpu_init(NULL);
-
-	init_problem_data();
-
-	launch_codelets();
-
-	terminate();
-
-	starpu_shutdown();
-
-	return 0;
-}

+ 17 - 131
examples/mult/dw_mult_no_stride.c

@@ -19,9 +19,6 @@
 #include "gordon/func_sgemm_ibm.h"
 #endif
 
-static pthread_mutex_t mutex;
-static pthread_cond_t cond;
-
 float *A[MAXSLICESY][MAXSLICESZ];
 float *B[MAXSLICESZ][MAXSLICESX];
 float *C[MAXSLICESY][MAXSLICESX];
@@ -30,16 +27,6 @@ starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
 starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
 starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
 
-/* fortran ordering ... */
-#define FULLA(i,j)	\
-	(A[(i)/BLOCKSIZEY][(j)/BLOCKSIZEZ][(i)%BLOCKSIZEY + ((j)%BLOCKSIZEZ)*BLOCKSIZEY])
-
-#define FULLB(i,j)	\
-	(B[(i)/BLOCKSIZEZ][(j)/BLOCKSIZEX][(i)%BLOCKSIZEZ + ((j)%BLOCKSIZEX)*BLOCKSIZEZ])
-
-#define FULLC(i,j)	\
-	(C[(i)/BLOCKSIZEY][(j)/BLOCKSIZEX][(i)%BLOCKSIZEY + ((j)%BLOCKSIZEX)*BLOCKSIZEY])
-
 #define TAG(x,y,z,iter)	\
 		((starpu_tag_t)((z) + (iter)*nslicesz + (x)*(nslicesz*niter) + (y)*(nslicesx*nslicesz*niter)))
 
@@ -66,92 +53,12 @@ static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
 
  */
 
-static void terminate(void)
-{
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-	uint64_t total_flop = BLAS3_FLOP(ydim, xdim, zdim)*niter;
-
-	fprintf(stderr, "Computation took (ms):\n");
-	printf("%2.2f\n", timing/1000);
-	fprintf(stderr, "	GFlop : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", (double)total_flop/1000000000.0f, (double)flop_cublas/1000000000.0f, (double)flop_atlas/1000000000.0f);
-	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
-
-	pthread_mutex_lock(&mutex);
-	pthread_cond_signal(&cond);
-	pthread_mutex_unlock(&mutex);
-}
-
-
-#define COMMON_CODE			\
-	uint32_t nxC, nyC, nyA;		\
-	uint32_t ldA, ldB, ldC;		\
-					\
-	float *subA;			\
-	float *subB;			\
-	float *subC;			\
-					\
-	subA = (float *)descr[0].blas.ptr;	\
-	subB = (float *)descr[1].blas.ptr;	\
-	subC = (float *)descr[2].blas.ptr;	\
-					\
-	nxC = descr[2].blas.nx;		\
-	nyC = descr[2].blas.ny;		\
-	nyA = descr[0].blas.ny;		\
-					\
-	ldA = descr[0].blas.ld;		\
-	ldB = descr[1].blas.ld;		\
-	ldC = descr[2].blas.ld;
-
-
-
-#ifdef USE_CUDA
-static void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
-{
-	COMMON_CODE
-
-	cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
-					     1.0f, subC, ldC);
-	cublasStatus st;
-	st = cublasGetError();
-	if (st != CUBLAS_STATUS_SUCCESS)
-		STARPU_ASSERT(0);
-
-	uint64_t flopcnt = BLAS3_FLOP(nyC, nxC, nyA);
-
-	flop_cublas += flopcnt;
-	ls_cublas += BLAS3_LS(nyC, nxC, nyA);
-}
-#endif
-
-static void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
-{
-	COMMON_CODE
-
-//	fprintf(stderr, "Call SGEMM : nxC %d nyC %d nyA %d subA %p ldA %d subB %p ldB %d subC %p ldC %d\n",
-//				nxC, nyC, nyA, subA, ldA, subB, ldB, subC, ldC);
-	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 1.0f, subC, ldC);
-
-	flop_atlas += BLAS3_FLOP(nxC, nyC, nyA);
-	ls_atlas += BLAS3_LS(nxC, nyC, nyA);
-}
-
 #define MEM_ALIGNMENT	16
 
 static void init_problem_data(void)
 {
 	unsigned i,j;
 
-	/* debug ... */
-	memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(float *));
-	memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(float *));
-	memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(float *));
-	memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
-	memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
-
 	/* Allocate grids of buffer */
 	/* TODO pin ... */
 	unsigned z, y, x;
@@ -313,26 +220,23 @@ static void cleanup_problem(void)
 	
 }
 
-int xycounter;
-
 struct cb2_s {
 	unsigned blockx;
 	unsigned blocky;
 	unsigned iter;
-	int *xycounter;
 };
 
 static starpu_codelet cl = {
 	.core_func = core_mult,
 #ifdef USE_CUDA
-	.cublas_func = cublas_mult,
+	.cuda_func = cublas_mult,
 #endif
 #ifdef USE_GORDON
 	/* .gordon_func will be set by load_elf_sgemm */
 #endif
 
 	.model = &sgemm_model,
-	.where = CORE|CUBLAS|GORDON,
+	.where = CORE|CUDA|GORDON,
 	.nbuffers = 3
 };
 
@@ -381,23 +285,6 @@ static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, un
 	return task;
 }
 
-
-static void callback_func(void *arg)
-{
-	/* the argument is a pointer to a counter of the remaining tasks */
-	int *counter = arg;
-	int newvalue = STARPU_ATOMIC_ADD(counter, -1);
-	if (newvalue == 0)
-	{
-		/* we are done */	
-		fprintf(stderr, "done ...\n");
-		terminate();
-	}
-
-	return;
-}
-
-
 static void callback_func_2(void *arg)
 {
 	/* the argument is a pointer to a counter of the remaining tasks */
@@ -410,7 +297,10 @@ static void callback_func_2(void *arg)
 
 	free(cb2);
 
-//	fprintf(stderr, "func 2 for x %d y %d iter %d\n", x, y, iter);
+	/* do some accounting */
+	int id = starpu_get_worker_id();
+	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
+	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
 
 	/* TAG(nslicesz - 1, y, x, iter) remains ... */
 	for (z = 0; z < nslicesz - 1; z++)
@@ -423,10 +313,7 @@ static void callback_func_2(void *arg)
 		starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
 	}
 	
-	if (iter == niter - 1) {
-		callback_func(&xycounter);
-	}
-	else {
+	if (iter != niter - 1) {
 		submit_new_iter(x, y, iter+1);
 	}
 }
@@ -450,7 +337,6 @@ static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
 				cb2->blockx = x;
 				cb2->blocky = y;
 				cb2->iter = iter;
-				cb2->xycounter = &xycounter;
 			task->callback_func = callback_func_2;
 			task->callback_arg = cb2;
 		}
@@ -467,9 +353,6 @@ static void launch_codelets(void)
 	/* partition the work into slices */
 	unsigned taskx, tasky;
 
-	/* only a callback per (nslicesz * niter) task given deps */
-	xycounter = nslicesx * nslicesy;
-
 	srand(time(NULL));
 
 	gettimeofday(&start, NULL);
@@ -490,24 +373,27 @@ int main(__attribute__ ((unused)) int argc,
 	/* start the runtime */
 	starpu_init(NULL);
 
+	starpu_helper_init_cublas();
+
 #ifdef USE_GORDON
 	load_elf_sgemm();
 #endif
 
-	pthread_mutex_init(&mutex, NULL);
-	pthread_cond_init(&cond, NULL);
-
 	init_problem_data();
 
 	launch_codelets();
 
-	pthread_mutex_lock(&mutex);
-	pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+	starpu_wait_all_tasks();
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+	display_stats(timing);
+
 
 	cleanup_problem();
 
-	exit(-1);
+	starpu_helper_shutdown_cublas();
 	starpu_shutdown();
 
 	return 0;

+ 17 - 109
examples/mult/dw_mult_no_stride_no_tag.c

@@ -20,9 +20,6 @@
 #endif
 
 
-static pthread_mutex_t mutex;
-static pthread_cond_t cond;
-
 struct pos {
 	unsigned x,y, z,iter;
 };
@@ -60,78 +57,6 @@ static void callback_func_3(void *arg);
 
  */
 
-static void terminate(void)
-{
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-	uint64_t total_flop = BLAS3_FLOP(ydim, xdim, zdim)*niter;
-
-	fprintf(stderr, "Computation took (ms):\n");
-	printf("%2.2f\n", timing/1000);
-	fprintf(stderr, "	GFlop : total (%2.2f) cublas (%2.2f) atlas (%2.2f)\n", (double)total_flop/1000000000.0f, (double)flop_cublas/1000000000.0f, (double)flop_atlas/1000000000.0f);
-	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
-
-	pthread_mutex_lock(&mutex);
-	pthread_cond_signal(&cond);
-	pthread_mutex_unlock(&mutex);
-}
-
-
-#define COMMON_CODE			\
-	uint32_t nxC, nyC, nyA;		\
-	uint32_t ldA, ldB, ldC;		\
-					\
-	float *subA;			\
-	float *subB;			\
-	float *subC;			\
-					\
-	subA = (float *)descr[0].blas.ptr;	\
-	subB = (float *)descr[1].blas.ptr;	\
-	subC = (float *)descr[2].blas.ptr;	\
-					\
-	nxC = descr[2].blas.nx;		\
-	nyC = descr[2].blas.ny;		\
-	nyA = descr[0].blas.ny;		\
-					\
-	ldA = descr[0].blas.ld;		\
-	ldB = descr[1].blas.ld;		\
-	ldC = descr[2].blas.ld;
-
-
-
-#ifdef USE_CUDA
-static void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
-{
-	COMMON_CODE
-
-	cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
-					     1.0f, subC, ldC);
-	cublasStatus st;
-	st = cublasGetError();
-	if (st != CUBLAS_STATUS_SUCCESS)
-		STARPU_ASSERT(0);
-
-	uint64_t flopcnt = BLAS3_FLOP(nyC, nxC, nyA);
-
-	flop_cublas += flopcnt;
-	ls_cublas += BLAS3_LS(nyC, nxC, nyA);
-}
-#endif
-
-static void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
-{
-	COMMON_CODE
-
-//	fprintf(stderr, "Call SGEMM : nxC %d nyC %d nyA %d subA %p ldA %d subB %p ldB %d subC %p ldC %d\n",
-//				nxC, nyC, nyA, subA, ldA, subB, ldB, subC, ldC);
-	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 1.0f, subC, ldC);
-
-	flop_atlas += BLAS3_FLOP(nxC, nyC, nyA);
-	ls_atlas += BLAS3_LS(nxC, nyC, nyA);
-}
-
 #define MEM_ALIGNMENT	16
 
 static void init_problem_data(void)
@@ -347,21 +272,18 @@ static void cleanup_problem(void)
 	
 }
 
-int xycounter;
-
 struct cb2_s {
 	unsigned blockx;
 	unsigned blocky;
 	unsigned iter;
-	int *xycounter;
 };
 
 
 static starpu_codelet cl = {
-	.where = CORE|CUBLAS|GORDON,
+	.where = CORE|CUDA|GORDON,
 	.core_func = core_mult,
 #ifdef USE_CUDA
-	.cublas_func = cublas_mult,
+	.cuda_func = cublas_mult,
 #endif
 #ifdef USE_GORDON
 	/* .gordon_func will be set by load_elf_sgemm */
@@ -419,23 +341,13 @@ static void construct_task(unsigned x, unsigned y, unsigned z, unsigned iter, st
 }
 
 
-static void callback_func(void *arg)
-{
-	/* the argument is a pointer to a counter of the remaining tasks */
-	int *counter = arg;
-	int newvalue = STARPU_ATOMIC_ADD(counter, -1);
-	if (newvalue == 0)
-	{
-		/* we are done */	
-		fprintf(stderr, "done ...\n");
-		terminate();
-	}
-
-	return;
-}
-
 static void callback_func_3(void *arg)
 {
+	/* do some accounting */
+	int id = starpu_get_worker_id();
+	flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
+	ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
+
 	/* the argument is a pointer to a counter of the remaining tasks */
 	struct pos *posp = arg;
 	unsigned x,y,z,iter;
@@ -455,10 +367,6 @@ static void callback_func_3(void *arg)
 		{
 			construct_task(x, y, 0, iter+1, posp);
 		}
-		else
-		{
-			callback_func(&xycounter);
-		}
 	}
 }
 
@@ -473,9 +381,6 @@ static void launch_codelets(void)
 	/* partition the work into slices */
 	unsigned taskx, tasky;
 
-	/* only a callback per (nslicesz * niter) task given deps */
-	xycounter = nslicesx * nslicesy;
-
 	srand(time(NULL));
 
 	gettimeofday(&start, NULL);
@@ -496,24 +401,27 @@ int main(__attribute__ ((unused)) int argc,
 	/* start the runtime */
 	starpu_init(NULL);
 
+	starpu_helper_init_cublas();
+
 #ifdef USE_GORDON
 	load_elf_sgemm();
 #endif
 
-	pthread_mutex_init(&mutex, NULL);
-	pthread_cond_init(&cond, NULL);
-
 	init_problem_data();
 
 	launch_codelets();
 
-	pthread_mutex_lock(&mutex);
-	pthread_cond_wait(&cond, &mutex);
-	pthread_mutex_unlock(&mutex);
+	starpu_wait_all_tasks();
+
+	gettimeofday(&end, NULL);
+
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	display_stats(timing);
 
 	cleanup_problem();
 
-	exit(-1);
+	starpu_helper_shutdown_cublas();
 	starpu_shutdown();
 
 	return 0;

+ 8 - 12
src/drivers/cuda/comp_cuda.h

@@ -14,17 +14,13 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#ifndef __COMP_CUDA_H__
-#define __COMP_CUDA_H__
+#define TYPE	float
 
-#include <stdio.h>
-#include <string.h>
-#include <stdint.h>
-#include <sys/types.h>
-#include <cuda.h>
+#define CUBLAS_GEMM cublasSgemm
+#define CPU_GEMM	SGEMM
+#define CPU_ASUM	SASUM
+#define CPU_IAMAX	ISAMAX
+#define STARPU_GEMM(name)	starpu_sgemm_##name
 
-#define UPDIV(a,b)	(((a)+(b)-1)/((b)))
-
-__device__ void cuda_dummy_mult(CUdeviceptr, CUdeviceptr, CUdeviceptr);
-
-#endif // __COMP_CUDA_H__
+#include "xgemm_kernels.c"
+#include "xgemm.c" 

+ 69 - 0
examples/mult/sgemm_kernels.c

@@ -0,0 +1,69 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/blas.h>
+
+#define COMMON_CODE			\
+	uint32_t nxC, nyC, nyA;		\
+	uint32_t ldA, ldB, ldC;		\
+					\
+	float *subA;			\
+	float *subB;			\
+	float *subC;			\
+					\
+	subA = (float *)descr[0].blas.ptr;	\
+	subB = (float *)descr[1].blas.ptr;	\
+	subC = (float *)descr[2].blas.ptr;	\
+					\
+	nxC = descr[2].blas.nx;		\
+	nyC = descr[2].blas.ny;		\
+	nyA = descr[0].blas.ny;		\
+					\
+	ldA = descr[0].blas.ld;		\
+	ldB = descr[1].blas.ld;		\
+	ldC = descr[2].blas.ld;
+
+
+
+#ifdef USE_CUDA
+void cublas_mult(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	COMMON_CODE
+
+	starpu_trace_user_event(0x42);
+
+	cublasSgemm('n', 'n', nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 
+					     0.0f, subC, ldC);
+	cublasStatus st;
+	st = cublasGetError();
+	if (st != CUBLAS_STATUS_SUCCESS)
+		STARPU_ASSERT(0);
+
+	cudaThreadSynchronize();
+
+	starpu_trace_user_event(0x43);
+}
+#endif
+
+void core_mult(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	COMMON_CODE
+
+	starpu_trace_user_event(0x42);
+	SGEMM("N", "N", nxC, nyC, nyA, 1.0f, subA, ldA, subB, ldB, 0.0f, subC, ldC);
+	starpu_trace_user_event(0x43);
+}

+ 261 - 0
examples/mult/xgemm.c

@@ -0,0 +1,261 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "dw_mult.h"
+
+#define str(s) #s
+#define xstr(s)        str(s)
+#define STARPU_GEMM_STR(name)  xstr(STARPU_GEMM(name))
+
+TYPE *A, *B, *C;
+starpu_data_handle A_handle, B_handle, C_handle;
+
+/*
+ * That program should compute C = A * B 
+ * 
+ *   A of size (z,y)
+ *   B of size (x,z)
+ *   C of size (x,y)
+
+              |---------------|
+            z |       B       |
+              |---------------|
+       z              x
+     |----|   |---------------|
+     |    |   |               |
+     |    |   |               |
+     | A  | y |       C       |
+     |    |   |               |
+     |    |   |               |
+     |----|   |---------------|
+
+ */
+
+static void check_output(void)
+{
+	/* check results */
+	/* compute C = C - AB */
+
+	CPU_GEMM("N", "N", ydim, xdim, zdim, (TYPE)-1.0, A, ydim, B, zdim, (TYPE)1.0f, C, ydim);
+		
+	/* make sure C = 0 */
+	TYPE err;
+	err = CPU_ASUM(xdim*ydim, C, 1);
+
+	int max;
+	max = CPU_IAMAX(xdim*ydim, C, 1);
+
+	fprintf(stderr, "Avg error : %e\n", err/(xdim*ydim));
+	fprintf(stderr, "Max error : %e\n", C[max]);
+}
+
+void callback_func(void *arg)
+{
+	/* do some accounting */
+	int id = starpu_get_worker_id();
+	flop_per_worker[id] += BLAS3_FLOP(conf.m, conf.n, conf.k);
+	ls_per_worker[id] += BLAS3_LS(conf.m, conf.n, conf.k);
+}
+
+static void init_problem_data(void)
+{
+	unsigned i,j;
+
+#ifdef USE_CUDA
+	if (pin) {
+		starpu_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(TYPE));
+		starpu_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(TYPE));
+		starpu_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(TYPE));
+	} else
+#endif
+	{
+#ifdef HAVE_POSIX_MEMALIGN
+		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(TYPE));
+		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(TYPE));
+		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(TYPE));
+#else
+		A = malloc(zdim*ydim*sizeof(TYPE));
+		B = malloc(xdim*zdim*sizeof(TYPE));
+		C = malloc(xdim*ydim*sizeof(TYPE));
+#endif
+	}
+
+	/* fill the A and B matrices */
+	if (norandom) {
+		for (j=0; j < ydim; j++) {
+			for (i=0; i < zdim; i++) {
+				A[j+i*ydim] = (TYPE)(i);
+			}
+		}
+	
+		for (j=0; j < zdim; j++) {
+			for (i=0; i < xdim; i++) {
+				B[j+i*zdim] = (TYPE)(j);
+			}
+		}
+	} 
+	else {
+		for (j=0; j < ydim; j++) {
+			for (i=0; i < zdim; i++) {
+				A[j+i*ydim] = (TYPE)(drand48());
+			}
+		}
+	
+		for (j=0; j < zdim; j++) {
+			for (i=0; i < xdim; i++) {
+				B[j+i*zdim] = (TYPE)(drand48());
+			}
+		}
+	}
+
+	for (j=0; j < ydim; j++) {
+		for (i=0; i < xdim; i++) {
+			C[j+i*ydim] = (TYPE)(0);
+		}
+	}
+
+	/* display memory consumption */
+	fprintf(stderr, "Total memory : %ld MB\n",
+		( ydim*zdim*sizeof(TYPE)
+		+ zdim*xdim*sizeof(TYPE)
+		+ ydim*xdim*sizeof(TYPE) )/(1024*1024));
+
+}
+
+static void partition_mult_data(void)
+{
+	starpu_register_blas_data(&A_handle, 0, (uintptr_t)A, 
+		ydim, ydim, zdim, sizeof(TYPE));
+	starpu_register_blas_data(&B_handle, 0, (uintptr_t)B, 
+		zdim, zdim, xdim, sizeof(TYPE));
+	starpu_register_blas_data(&C_handle, 0, (uintptr_t)C, 
+		ydim, ydim, xdim, sizeof(TYPE));
+
+	starpu_data_set_wb_mask(C_handle, 1<<0);
+
+	conf.k = zdim;
+	conf.m = ydim/nslicesy;
+	conf.n = xdim/nslicesx;
+
+	starpu_filter f;
+	f.filter_func = starpu_vertical_block_filter_func;
+	f.filter_arg = nslicesx;
+		
+	starpu_filter f2;
+	f2.filter_func = starpu_block_filter_func;
+	f2.filter_arg = nslicesy;
+		
+	starpu_partition_data(B_handle, &f);
+	starpu_partition_data(A_handle, &f2);
+
+	starpu_map_filters(C_handle, 2, &f, &f2);
+}
+
+static void unpartition_mult_data(void)
+{
+	starpu_unpartition_data(C_handle, 0);
+
+	starpu_delete_data(C_handle);
+}
+
+static struct starpu_perfmodel_t gemm_model = {
+	.type = HISTORY_BASED,
+#ifdef ATLAS
+	.symbol = STARPU_GEMM_STR(gemm_atlas)
+#elif defined(GOTO)
+	.symbol = STARPU_GEMM_STR(gemm_goto)
+#else
+	.symbol = STARPU_GEMM_STR(gemm)
+#endif
+};
+
+static starpu_codelet cl = {
+	.where = CORE|CUDA,
+	.core_func = STARPU_GEMM(core_mult),
+#ifdef USE_CUDA
+	.cuda_func = STARPU_GEMM(cublas_mult),
+#endif
+	.model = &gemm_model,
+	.nbuffers = 3
+};
+
+static void launch_codelets(void)
+{
+	/* partition the work into slices */
+	unsigned taskx, tasky;
+
+	for (taskx = 0; taskx < nslicesx; taskx++) 
+	{
+		for (tasky = 0; tasky < nslicesy; tasky++)
+		{
+			/* A B[task] = C[task] */
+			struct starpu_task *task = starpu_task_create();
+
+			task->cl = &cl;
+			task->cl_arg = &conf;
+			task->cl_arg_size = sizeof(struct block_conf);
+
+			/* we have a callback to do some accounting */
+			task->callback_func = callback_func;
+			task->callback_arg = NULL;
+
+			task->buffers[0].handle = get_sub_data(A_handle, 1, tasky);
+			task->buffers[0].mode = STARPU_R;
+			task->buffers[1].handle = get_sub_data(B_handle, 1, taskx);
+			task->buffers[1].mode = STARPU_R;
+			task->buffers[2].handle = 
+				get_sub_data(C_handle, 2, taskx, tasky);
+			task->buffers[2].mode = STARPU_RW;
+
+			starpu_submit_task(task);
+		}
+	}
+}
+
+int main(__attribute__ ((unused)) int argc, 
+	 __attribute__ ((unused)) char **argv)
+{
+
+	parse_args(argc, argv);
+
+	/* start the runtime */
+	starpu_init(NULL);
+	starpu_helper_init_cublas();
+
+	init_problem_data();
+
+	gettimeofday(&start, NULL);
+
+	partition_mult_data();
+
+	launch_codelets();
+	starpu_wait_all_tasks();
+
+	gettimeofday(&end, NULL);
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
+					(end.tv_usec - start.tv_usec));
+	display_stats(timing);
+
+	unpartition_mult_data();
+	
+	if (check)
+		check_output();
+	
+	starpu_helper_shutdown_cublas();
+	starpu_shutdown();
+
+	return 0;
+}
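
As a worked example of the performance-model symbol construction above: in the single-precision build, where STARPU_GEMM(name) expands to starpu_sgemm_##name, STARPU_GEMM_STR(gemm_atlas) goes through xstr/str and yields the string "starpu_sgemm_gemm_atlas"; the double-precision build of dgemm.c yields "starpu_dgemm_gemm_atlas". The history-based model is therefore recorded under a distinct symbol per precision and per BLAS library.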

+ 71 - 0
examples/mult/xgemm_kernels.c

@@ -0,0 +1,71 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/blas.h>
+
+#define COMMON_CODE			\
+	uint32_t nxC, nyC, nyA;		\
+	uint32_t ldA, ldB, ldC;		\
+					\
+	TYPE *subA;			\
+	TYPE *subB;			\
+	TYPE *subC;			\
+					\
+	subA = (TYPE *)descr[0].blas.ptr;	\
+	subB = (TYPE *)descr[1].blas.ptr;	\
+	subC = (TYPE *)descr[2].blas.ptr;	\
+					\
+	nxC = descr[2].blas.nx;		\
+	nyC = descr[2].blas.ny;		\
+	nyA = descr[0].blas.ny;		\
+					\
+	ldA = descr[0].blas.ld;		\
+	ldB = descr[1].blas.ld;		\
+	ldC = descr[2].blas.ld;
+
+
+
+#ifdef USE_CUDA
+void STARPU_GEMM(cublas_mult)(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
+{
+	COMMON_CODE
+
+	starpu_trace_user_event(0x42);
+
+	CUBLAS_GEMM('n', 'n', nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
+					     (TYPE)0.0, subC, ldC);
+	cublasStatus st;
+	st = cublasGetError();
+	if (st != CUBLAS_STATUS_SUCCESS)
+		STARPU_ASSERT(0);
+
+	cudaThreadSynchronize();
+
+	starpu_trace_user_event(0x43);
+}
+#endif
+
+void STARPU_GEMM(core_mult)(starpu_data_interface_t *descr, __attribute__((unused))  void *arg)
+{
+	COMMON_CODE
+
+	starpu_trace_user_event(0x42);
+	CPU_GEMM("N", "N", nxC, nyC, nyA, (TYPE)1.0, subA, ldA, subB, ldB,
+					  (TYPE)0.0, subC, ldC);
+
+	starpu_trace_user_event(0x43);
+}

+ 10 - 10
examples/pastix-wrappers/starpu-blas-wrapper.c

@@ -200,8 +200,8 @@ static data_state work_block_2;
 
 void allocate_maxbloktab_on_cublas(starpu_data_interface_t *descr __attribute__((unused)), void *arg __attribute__((unused)))
 {
-	request_data_allocation(&work_block_1, 1);
-	request_data_allocation(&work_block_2, 1);
+	starpu_request_data_allocation(&work_block_1, 1);
+	starpu_request_data_allocation(&work_block_2, 1);
 
 
 	starpu_filter f1, f2;
@@ -233,8 +233,8 @@ void STARPU_DECLARE_WORK_BLOCKS(float *maxbloktab1, float *maxbloktab2, unsigned
 	sem_t sem;
 
 	/* initialize codelet */
-	cl.where = CUBLAS;
-	cl.cublas_func = allocate_maxbloktab_on_cublas;
+	cl.where = CUDA;
+	cl.cuda_func = allocate_maxbloktab_on_cublas;
 	
 	j = job_create();
 	j->cb = _cublas_cblk_strsm_callback;
@@ -321,9 +321,9 @@ void STARPU_CBLK_STRSM(unsigned col)
 	sem_t sem;
 
 	/* initialize codelet */
-	cl.where = CORE|CUBLAS;
+	cl.where = CORE|CUDA;
 	cl.core_func = _core_cblk_strsm;
-	cl.cublas_func = _cublas_cblk_strsm;
+	cl.cuda_func = _cublas_cblk_strsm;
 	
 	j = job_create();
 //	j->where = (starpu_get_blas_nx(&cblktab[col]) > BLOCK && starpu_get_blas_ny(&cblktab[col]) > BLOCK)? CUBLAS:CORE;
@@ -461,9 +461,9 @@ void STARPU_COMPUTE_CONTRIB_COMPACT(unsigned col, int dimi, int dimj, int dima,
 	sem_t sem;
 
 	/* initialize codelet */
-	cl.where = CUBLAS|CORE;
+	cl.where = CUDA|CORE;
 	cl.core_func = _core_compute_contrib_compact;
-	cl.cublas_func = _cublas_compute_contrib_compact;
+	cl.cuda_func = _cublas_compute_contrib_compact;
 	
 	j = job_create();
 
@@ -603,9 +603,9 @@ void STARPU_SGEMM (const char *transa, const char *transb, const int m,
 	starpu_register_blas_data(&C_state, 0, (uintptr_t)C, ldc, m, n, sizeof(float));
 
 	/* initialize codelet */
-	cl.where = CUBLAS;
+	cl.where = CUDA;
 	//cl.core_func = _core_strsm;
-	cl.cublas_func = _cublas_sgemm;
+	cl.cuda_func = _cublas_sgemm;
 	
 	j = job_create();
 	j->cb = _cublas_sgemm_callback;

+ 1 - 1
examples/ppm-downscaler/ppm-downscaler.c

@@ -18,7 +18,7 @@
 
 #include <starpu.h>
 
-#ifdef HAVE_MALLOC_H_HEADER
+#ifdef HAVE_MALLOC_H
 #include <malloc.h>
 #endif
 #include <stdlib.h>

+ 1 - 0
examples/ppm-downscaler/yuv-downscaler.c

@@ -18,6 +18,7 @@
 
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/time.h>
 #include <unistd.h>
 #include <assert.h>
 #include <stdio.h>

+ 2 - 2
examples/spmv/dw_block_spmv.c

@@ -115,10 +115,10 @@ void call_filters(void)
 unsigned totaltasks;
 
 starpu_codelet cl = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.core_func =  core_block_spmv,
 #ifdef USE_CUDA
-	.cublas_func = cublas_block_spmv,
+	.cuda_func = cublas_block_spmv,
 #endif
 	.nbuffers = 3
 };

+ 18 - 44
examples/spmv/dw_spmv.c

@@ -26,43 +26,31 @@ struct timeval end;
 unsigned nblocks = 1;
 unsigned remainingtasks = -1;
 
-/* First a Matrix-Vector product (SpMV) */
-
-unsigned blocks = 512;
-unsigned grids  = 8;
-
 #ifdef USE_CUDA
-/* CUDA spmv codelet */
-static struct starpu_cuda_module_s cuda_module;
-static struct starpu_cuda_function_s cuda_function;
-static starpu_cuda_codelet_t cuda_spmv;
 
-void initialize_cuda(void)
-{
-	char module_path[1024];
-	sprintf(module_path,
-		"%s/examples/cuda/spmv_cuda.cubin", STARPUDIR);
-	char *function_symbol = "spmv_kernel_3";
+extern void spmv_kernel_cpu_wrapper(uint32_t nnz, uint32_t nrow, float *nzval,
+			uint32_t *colind, uint32_t *rowptr, uint32_t firstentry,
+			float *vecin, uint32_t nx_in,
+			float * vecout, uint32_t nx_out);
 
-	starpu_init_cuda_module(&cuda_module, module_path);
-	starpu_init_cuda_function(&cuda_function, &cuda_module, function_symbol);
-
-	cuda_spmv.func = &cuda_function;
-	cuda_spmv.stack = NULL;
-	cuda_spmv.stack_size = 0; 
+void spmv_kernel_cuda(starpu_data_interface_t *buffers, void *args)
+{
+	uint32_t nnz = buffers[0].csr.nnz;
+	uint32_t nrow = buffers[0].csr.nrow;
+	float *nzval = (float *)buffers[0].csr.nzval;
+	uint32_t *colind = buffers[0].csr.colind;
+	uint32_t *rowptr = buffers[0].csr.rowptr;
+	uint32_t firstentry = buffers[0].csr.firstentry;
 
-	cuda_spmv.gridx = grids;
-	cuda_spmv.gridy = 1;
+	float *vecin = (float *)buffers[1].vector.ptr;
+	uint32_t nx_in = buffers[1].vector.nx;
 
-	cuda_spmv.blockx = blocks;
-	cuda_spmv.blocky = 1;
+	float *vecout = (float *)buffers[2].vector.ptr;
+	uint32_t nx_out = buffers[2].vector.nx;
 
-	cuda_spmv.shmemsize = 60;
+	spmv_kernel_cpu_wrapper(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
 }
 
-
-
-
 #endif // USE_CUDA
 
 
@@ -91,16 +79,6 @@ void parse_args(int argc, char **argv)
 			size = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-block") == 0) {
-			char *argptr;
-			blocks = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-grid") == 0) {
-			char *argptr;
-			grids = strtol(argv[++i], &argptr, 10);
-		}
-
 		if (strcmp(argv[i], "-nblocks") == 0) {
 			char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
@@ -274,7 +252,7 @@ void call_spmv_codelet_filters(void)
 	cl->where = CORE|CUDA;
 	cl->core_func =  core_spmv;
 #ifdef USE_CUDA
-	cl->cuda_func = &cuda_spmv;
+	cl->cuda_func = spmv_kernel_cuda;
 #endif
 	cl->nbuffers = 3;
 
@@ -330,10 +308,6 @@ int main(__attribute__ ((unused)) int argc,
 
 	sem_init(&sem, 0, 0U);
 
-#ifdef USE_CUDA
-	initialize_cuda();
-#endif
-
 	init_problem();
 
 	sem_wait(&sem);
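
For comparison with the CUDA wrapper above, a CPU kernel for the same codelet reads exactly the same CSR and vector interface fields from the buffer descriptors. The sketch below is only an illustration; the actual core_spmv used by the codelet is defined elsewhere in dw_spmv.c and may differ:

/* Hypothetical CPU counterpart of spmv_kernel_cuda: same buffer layout
 * (CSR matrix, input vector, output vector), sequential traversal. */
#include <stdint.h>
#include <starpu.h>

void spmv_kernel_cpu_sketch(starpu_data_interface_t *buffers,
			void *args __attribute__((unused)))
{
	uint32_t nrow = buffers[0].csr.nrow;
	float *nzval = (float *)buffers[0].csr.nzval;
	uint32_t *colind = buffers[0].csr.colind;
	uint32_t *rowptr = buffers[0].csr.rowptr;
	uint32_t firstentry = buffers[0].csr.firstentry;

	float *vecin = (float *)buffers[1].vector.ptr;
	float *vecout = (float *)buffers[2].vector.ptr;

	uint32_t row;
	for (row = 0; row < nrow; row++)
	{
		float tmp = 0.0f;
		uint32_t index;

		uint32_t firstindex = rowptr[row] - firstentry;
		uint32_t lastindex = rowptr[row+1] - firstentry;

		for (index = firstindex; index < lastindex; index++)
			tmp += nzval[index]*vecin[colind[index]];

		vecout[row] = tmp;
	}
}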

+ 12 - 29
examples/cuda/spmv_cuda.cu

@@ -49,9 +49,9 @@ void spmv_kernel(uint32_t nnz, uint32_t nrow, float *nzval, uint32_t *colind, ui
 }
 
 extern "C" __global__ 
-void spmv_kernel_2(uint32_t nnz, uint32_t nrow, float *nzval, uint32_t *colind, uint32_t *rowptr, 
-			uint32_t firstentry, uint32_t elemsize, 
-			float *vecin, uint32_t nx_in, uint32_t elemsize1, float * vecout, uint32_t nx_out, uint32_t elemsize2)
+void spmv_kernel_3(uint32_t nnz, uint32_t nrow, float *nzval, uint32_t *colind, uint32_t *rowptr, 
+			uint32_t firstentry, 
+			float *vecin, uint32_t nx_in, float * vecout, uint32_t nx_out)
 {
 	/* only one dimension is used here */
 	unsigned block_rowstart = blockIdx.x*( (nrow + gridDim.x - 1)/gridDim.x );
@@ -77,34 +77,17 @@ void spmv_kernel_2(uint32_t nnz, uint32_t nrow, float *nzval, uint32_t *colind,
 
 }
 
-
-
-extern "C" __global__ 
-void spmv_kernel_3(uint32_t nnz, uint32_t nrow, float *nzval, uint32_t *colind, uint32_t *rowptr, 
-			uint32_t firstentry, uint32_t elemsize, 
-			float *vecin, uint32_t nx_in, uint32_t elemsize1, float * vecout, uint32_t nx_out, uint32_t elemsize2)
+extern "C" void spmv_kernel_cpu_wrapper(uint32_t nnz, uint32_t nrow, float *nzval,
+			uint32_t *colind, uint32_t *rowptr, uint32_t firstentry,
+			float *vecin, uint32_t nx_in,
+			float * vecout, uint32_t nx_out)
 {
-	/* only one dimension is used here */
-	unsigned block_rowstart = blockIdx.x*( (nrow + gridDim.x - 1)/gridDim.x );
-	unsigned block_rowend = MIN((blockIdx.x+1)*( (nrow + gridDim.x - 1)/gridDim.x ), nrow);
+	dim3 dimBlock(8, 1);
+	dim3 dimGrid(512, 1);
 
-	unsigned row;
-	for (row = block_rowstart + threadIdx.x; row < block_rowend; row+=blockDim.x)
-	{
-		float tmp = 0.0f;
-		unsigned index;
-
-		unsigned firstindex = rowptr[row] - firstentry;
-		unsigned lastindex = rowptr[row+1] - firstentry;
-
-		for (index = firstindex; index < lastindex; index++)
-		{
-			tmp += nzval[index]*vecin[colind[index]];
-		}
-
-		vecout[row] = tmp;
-	}
-	
+	spmv_kernel_3<<<dimGrid, dimBlock>>>(nnz, nrow, nzval, colind, rowptr,
+						firstentry, vecin, nx_in, vecout, nx_out);
 
+	cudaThreadSynchronize();
 }
 

+ 60 - 0
examples/starpufft/Makefile.am

@@ -0,0 +1,60 @@
+#
+# StarPU
+# Copyright (C) INRIA 2009 (see AUTHORS file)
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+#
+AM_CPPFLAGS = -I$(top_srcdir)/include/
+
+lib_LTLIBRARIES = libstarpufft.la
+
+EXTRA_DIST =			\
+	starpufft.h		\
+	float.h			\
+	double.h		\
+	cudax_kernels.h		\
+	starpufftx.c		\
+	starpufftx1d.c		\
+	starpufftx2d.c		\
+	cuda_kernels.cu		\
+	cudaf_kernels.cu	\
+	cudax_kernels.cu	\
+	testx.c			\
+	testx_threads.c	
+	
+
+libstarpufft_la_SOURCES = starpufft.c starpufftf.c starpufft-common.c
+libstarpufft_la_LIBADD = $(top_builddir)/src/libstarpu.la $(FFTW_LIBS) $(FFTWF_LIBS)
+libstarpufft_la_CFLAGS = $(FFTWF_CFLAGS)
+
+if USE_CUDA
+# TODO define NVCCFLAGS
+NVCC ?= nvcc
+NVCCFLAGS += -Xcompiler -fPIC -Xlinker -fPIC
+
+cuda_kernels.o: cuda_kernels.cu
+	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir} -arch sm_13 
+
+cudaf_kernels.o: cudaf_kernels.cu
+	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS) -I${includedir}
+
+libstarpufft_la_SOURCES += cuda_kernels.cu cudaf_kernels.cu
+am_libstarpufft_la_OBJECTS = cuda_kernels.o cudaf_kernels.o starpufft.lo starpufftf.lo starpufft-common.lo
+libstarpufft_la_LIBADD += -lcufft
+endif
+
+check_PROGRAMS = test testf
+test_LDADD = libstarpufft.la $(top_builddir)/src/libstarpu.la $(FFTW_LIBS)
+testf_LDADD = libstarpufft.la $(top_builddir)/src/libstarpu.la $(FFTWF_LIBS)
+#test_threads_LDADD = libstarpufft.la $(top_builddir)/src/libstarpu.la -lfftw3_threads
+#testf_threads_LDADD = libstarpufft.la $(top_builddir)/src/libstarpu.la -lfftw3f_threads

+ 18 - 0
examples/starpufft/cuda_kernels.cu

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "cudax_kernels.cu"

+ 18 - 0
examples/starpufft/cudaf_kernels.cu

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "cudax_kernels.cu"

+ 142 - 0
examples/starpufft/cudax_kernels.cu

@@ -0,0 +1,142 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define _externC extern "C"
+#include "cudax_kernels.h"
+
+/* Note: these assume that the sizes are powers of two */
+
+#define VARS_1d \
+	unsigned start = threadIdx.x + blockIdx.x * blockDim.x; \
+	unsigned numthreads = blockDim.x * gridDim.x;
+
+#define DISTRIB_1d(n, func,args,stream) \
+	unsigned threads_per_block = 128; \
+\
+	if (n < threads_per_block) { \
+		dim3 dimGrid(n); \
+		func <<<dimGrid, 1, 0, stream>>> args; \
+	} else { \
+		dim3 dimGrid(n / threads_per_block); \
+		dim3 dimBlock(threads_per_block); \
+		func <<<dimGrid, dimBlock, 0, stream>>> args; \
+	} \
+	cudaStreamSynchronize(stream); \
+
+extern "C" __global__ void
+STARPUFFT(cuda_twist1_1d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2)
+{
+	unsigned j;
+	VARS_1d
+	unsigned end = n2;
+
+	for (j = start; j < end; j += numthreads)
+		twisted1[j] = in[i+j*n1];
+}
+
+extern "C" void
+STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream)
+{
+	DISTRIB_1d(n2, STARPUFFT(cuda_twist1_1d), (in, twisted1, i, n1, n2), stream);
+}
+
+extern "C" __global__ void
+STARPUFFT(cuda_twiddle_1d)(_cuComplex * out, const _cuComplex * roots, unsigned n, unsigned i)
+{
+	unsigned j;
+	VARS_1d
+	unsigned end = n;
+
+	for (j = start; j < end; j += numthreads)
+		out[j] = _cuCmul(out[j], roots[i*j]);
+	return;
+}
+
+extern "C" void
+STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream)
+{
+	DISTRIB_1d(n, STARPUFFT(cuda_twiddle_1d), (out, roots, n, i), stream);
+}
+
+#define VARS_2d \
+	unsigned startx = threadIdx.x + blockIdx.x * blockDim.x; \
+	unsigned starty = threadIdx.y + blockIdx.y * blockDim.y; \
+	unsigned numthreadsx = blockDim.x * gridDim.x; \
+	unsigned numthreadsy = blockDim.y * gridDim.y;
+
+#define DISTRIB_2d(n, m, func, args, stream) \
+	unsigned threads_per_dim = 16; \
+	if (n < threads_per_dim) { \
+		if (m < threads_per_dim) { \
+			dim3 dimGrid(n, m); \
+			func <<<dimGrid, 1, 0, stream>>> args; \
+		} else { \
+			dim3 dimGrid(1, m / threads_per_dim); \
+			dim3 dimBlock(n, threads_per_dim); \
+			func <<<dimGrid, dimBlock, 0, stream>>> args; \
+		} \
+	} else {  \
+		if (m < threads_per_dim) { \
+			dim3 dimGrid(n / threads_per_dim, 1); \
+			dim3 dimBlock(threads_per_dim, m); \
+			func <<<dimGrid, dimBlock, 0, stream>>> args; \
+		} else { \
+			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
+			dim3 dimBlock(threads_per_dim, threads_per_dim); \
+			func <<<dimGrid, dimBlock, 0, stream>>> args; \
+		} \
+	} \
+	cudaStreamSynchronize(stream);
+
+extern "C" __global__ void
+STARPUFFT(cuda_twist1_2d)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2)
+{
+	unsigned k, l;
+	VARS_2d
+	unsigned endx = n2;
+	unsigned endy = m2;
+	unsigned m = m1*m2;
+
+	for (k = startx; k < endx; k += numthreadsx)
+		for (l = starty; l < endy; l += numthreadsy)
+			twisted1[k*m2+l] = in[i*m+j+k*m*n1+l*m1];
+}
+
+extern "C" void
+STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream)
+{
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twist1_2d), (in, twisted1, i, j, n1, n2, m1, m2), stream);
+}
+
+extern "C" __global__ void
+STARPUFFT(cuda_twiddle_2d)(_cuComplex * out, const _cuComplex * roots0, const _cuComplex * roots1, unsigned n2, unsigned m2, unsigned i, unsigned j)
+{
+	unsigned k, l;
+	VARS_2d
+	unsigned endx = n2;
+	unsigned endy = m2;
+
+	for (k = startx; k < endx ; k += numthreadsx)
+		for (l = starty; l < endy ; l += numthreadsy)
+			out[k*m2 + l] = _cuCmul(_cuCmul(out[k*m2 + l], roots0[i*k]), roots1[j*l]);
+	return;
+}
+
+extern "C" void
+STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream)
+{
+	DISTRIB_2d(n2, m2, STARPUFFT(cuda_twiddle_2d), (out, roots0, roots1, n2, m2, i, j), stream);
+}
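
As a worked example of the launch-geometry choice above: with n = 1024 and threads_per_block = 128, DISTRIB_1d launches a grid of 8 blocks of 128 threads; with n = 64 (below the threshold) it launches 64 single-thread blocks. DISTRIB_2d applies the same rule per dimension with threads_per_dim = 16, so n = 64 and m = 8 give a 4x1 grid of 16x8 blocks. Both macros end with cudaStreamSynchronize(stream), so the _host wrappers only return once the kernel has completed.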

+ 21 - 0
examples/starpufft/cudax_kernels.h

@@ -0,0 +1,21 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <cuComplex.h>
+_externC void STARPUFFT(cuda_twist1_1d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned n1, unsigned n2, cudaStream_t stream);
+_externC void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComplex *roots, unsigned n, unsigned i, cudaStream_t stream);
+_externC void STARPUFFT(cuda_twist1_2d_host)(const _cuComplex *in, _cuComplex *twisted1, unsigned i, unsigned j, unsigned n1, unsigned n2, unsigned m1, unsigned m2, cudaStream_t stream);
+_externC void STARPUFFT(cuda_twiddle_2d_host)(_cuComplex *out, const _cuComplex *roots0, const _cuComplex *roots1, unsigned n2, unsigned m2, unsigned i, unsigned j, cudaStream_t stream);

+ 47 - 0
examples/starpufft/double.h

@@ -0,0 +1,47 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <starpu_config.h>
+
+#ifdef HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#ifdef USE_CUDA
+#include <cufft.h>
+#endif
+
+typedef double real;
+#ifdef HAVE_FFTW
+typedef fftw_complex _fftw_complex;
+typedef fftw_plan _fftw_plan;
+#endif
+#ifdef USE_CUDA
+typedef cuDoubleComplex _cuComplex;
+typedef cufftDoubleComplex _cufftComplex;
+#define _cufftExecC2C cufftExecZ2Z
+#define _cufftExecR2C cufftExecD2Z
+#define _cufftExecC2R cufftExecZ2D
+#define _CUFFT_C2C CUFFT_Z2Z
+#define _CUFFT_R2C CUFFT_D2Z
+#define _CUFFT_C2R CUFFT_Z2D
+#define _cuCmul(x,y) cuCmul(x,y)
+#endif
+#define STARPUFFT(name) starpufft_##name
+#define _FFTW(name) fftw_##name
+
+#define TYPE ""

+ 47 - 0
examples/starpufft/float.h

@@ -0,0 +1,47 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <starpu_config.h>
+
+#ifdef HAVE_FFTW
+#include <fftw3.h>
+#endif
+
+#ifdef USE_CUDA
+#include <cufft.h>
+#endif
+
+typedef float real;
+#ifdef HAVE_FFTW
+typedef fftwf_complex _fftw_complex;
+typedef fftwf_plan _fftw_plan;
+#endif
+#ifdef USE_CUDA
+typedef cuComplex _cuComplex;
+typedef cufftComplex _cufftComplex;
+#define _cufftExecC2C cufftExecC2C
+#define _cufftExecR2C cufftExecR2C
+#define _cufftExecC2R cufftExecC2R
+#define _CUFFT_C2C CUFFT_C2C
+#define _CUFFT_R2C CUFFT_R2C
+#define _CUFFT_C2R CUFFT_C2R
+#define _cuCmul(x,y) cuCmulf(x,y)
+#endif
+#define STARPUFFT(name) starpufftf_##name
+#define _FFTW(name) fftwf_##name
+
+#define TYPE "f"

+ 19 - 0
examples/starpufft/starpufft-common.c

@@ -0,0 +1,19 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "starpufft.h"
+
+int starpufft_last_plan_number;

+ 18 - 0
examples/starpufft/starpufft.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "starpufftx.c"

+ 54 - 0
examples/starpufft/starpufft.h

@@ -0,0 +1,54 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <complex.h>
+#include <starpu.h>
+
+#define STARPUFFT_FORWARD -1
+#define STARPUFFT_INVERSE 1
+
+#define __STARPUFFT(name) starpufft_##name
+#define __STARPUFFTF(name) starpufftf_##name
+#define __STARPUFFTL(name) starpufftl_##name
+
+#define __STARPUFFT_INTERFACE(starpufft,real) \
+typedef real _Complex starpufft(complex); \
+\
+typedef struct starpufft(plan) *starpufft(plan); \
+\
+starpufft(plan) starpufft(plan_dft_1d)(int n, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_2d)(int n, int m, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_r2c_1d)(int n, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_c2r_1d)(int n, unsigned flags); \
+\
+void *starpufft(malloc)(size_t n); \
+void starpufft(free)(void *p); \
+\
+void starpufft(execute)(starpufft(plan) p, void *in, void *out); \
+starpu_tag_t starpufft(start)(starpufft(plan) p, void *in, void *out); \
+\
+void starpufft(destroy_plan)(starpufft(plan) p); \
+\
+void starpufft(startstats)(void); \
+void starpufft(stopstats)(void); \
+void starpufft(showstats)(FILE *out);
+
+__STARPUFFT_INTERFACE(__STARPUFFT, double)
+__STARPUFFT_INTERFACE(__STARPUFFTF, float)
+__STARPUFFT_INTERFACE(__STARPUFFTL, long double)
+
+extern int starpufft_last_plan_number;

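The header above is the whole public interface of the example FFT library: plans are created per size and sign, starpufft_execute() submits the task graph and blocks on the final tag, while starpufft_start() only submits it and returns that tag. A minimal usage sketch for the double-precision 1-D case (a hypothetical driver, assuming the application brackets it with the usual starpu_init()/starpu_shutdown() calls; flags must currently be 0):

/* Hypothetical driver for the API declared in examples/starpufft/starpufft.h
 * above; error checking is omitted. */
#include <stdio.h>
#include "starpufft.h"

int main(void)
{
	int n = 1 << 16;	/* power of two, large enough for the two-level split */
	int i;

	starpu_init(NULL);

	starpufft_complex *in = starpufft_malloc(n * sizeof(*in));
	starpufft_complex *out = starpufft_malloc(n * sizeof(*out));
	for (i = 0; i < n; i++)
		in[i] = i % 7;

	starpufft_plan p = starpufft_plan_dft_1d(n, STARPUFFT_FORWARD, 0);
	starpufft_execute(p, in, out);	/* submits the task graph and waits for the END tag */
	starpufft_showstats(stdout);

	starpufft_destroy_plan(p);
	starpufft_free(in);
	starpufft_free(out);
	starpu_shutdown();
	return 0;
}
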
+ 18 - 0
examples/starpufft/starpufftf.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "starpufftx.c"

+ 378 - 0
examples/starpufft/starpufftx.c

@@ -0,0 +1,378 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+#include <config.h>
+
+#include "starpufft.h"
+#ifdef USE_CUDA
+#define _externC extern
+#include "cudax_kernels.h"
+#endif
+
+#define _FFTW_FLAGS FFTW_ESTIMATE
+
+enum steps {
+	SPECIAL, TWIST1, FFT1, JOIN, TWIST2, FFT2, TWIST3, END
+};
+
+#define NUMBER_BITS 5
+#define NUMBER_SHIFT (64 - NUMBER_BITS)
+#define STEP_BITS 3
+#define STEP_SHIFT (NUMBER_SHIFT - STEP_BITS)
+
+#define _STEP_TAG(plan, step, i) (((starpu_tag_t) plan->number << NUMBER_SHIFT) | ((starpu_tag_t)(step) << STEP_SHIFT) | (starpu_tag_t) (i))
+
+
+#define I_BITS STEP_SHIFT
+
+enum type {
+	R2C,
+	C2R,
+	C2C
+};
+
+static unsigned task_per_worker[STARPU_NMAXWORKERS];
+static unsigned samples_per_worker[STARPU_NMAXWORKERS];
+static struct timeval start, submit_tasks, end;
+
+/*
+ *
+ *	The actual kernels
+ *
+ */
+
+struct STARPUFFT(plan) {
+	int number;	/* uniquely identifies the plan, for starpu tags */
+
+	int *n;
+	int *n1;
+	int *n2;
+	int totsize;
+	int totsize1;	/* Number of first-round tasks */
+	int totsize2;	/* Size of first-round tasks */
+	int totsize3;	/* Number of second-round tasks */
+	int totsize4;	/* Size of second-round tasks */
+	int dim;
+	enum type type;
+	int sign;
+
+	STARPUFFT(complex) *roots[2];
+	starpu_data_handle roots_handle[2];
+
+	struct {
+#ifdef USE_CUDA
+		cufftHandle plan1_cuda, plan2_cuda;
+		int initialized1, initialized2;
+		cudaStream_t stream;
+		int stream_is_initialized;
+#endif
+#ifdef HAVE_FFTW
+		_fftw_plan plan1_cpu, plan2_cpu;
+		_fftw_complex *in1, *out1;
+		_fftw_complex *in2, *out2;
+#endif
+	} plans[STARPU_NMAXWORKERS];
+
+#ifdef HAVE_FFTW
+	_fftw_plan plan_gather;
+#endif
+
+	STARPUFFT(complex) *in, *twisted1, *fft1, *twisted2, *fft2, *out;
+
+	starpu_data_handle in_handle, *twisted1_handle, *fft1_handle, *twisted2_handle, *fft2_handle;
+	struct starpu_task **twist1_tasks, **fft1_tasks, **twist2_tasks, **fft2_tasks, **twist3_tasks;
+	struct starpu_task *join_task, *end_task;
+	struct STARPUFFT(args) *fft1_args, *fft2_args;
+};
+
+struct STARPUFFT(args) {
+	struct STARPUFFT(plan) *plan;
+	int i, j, jj, kk, ll, *iv, *kkv;
+};
+
+#ifdef USE_CUDA
+cudaStream_t
+STARPUFFT(get_local_stream)(STARPUFFT(plan) plan, int workerid)
+{
+	if (!plan->plans[workerid].stream_is_initialized)
+	{
+		cudaStreamCreate(&plan->plans[workerid].stream);
+
+		plan->plans[workerid].stream_is_initialized = 1;
+	}
+
+	return plan->plans[workerid].stream;
+}
+#endif
+
+static void
+check_dims(STARPUFFT(plan) plan)
+{
+	int dim;
+	for (dim = 0; dim < plan->dim; dim++)
+		if (plan->n[dim] & (plan->n[dim]-1)) {
+			fprintf(stderr,"can't cope with non-power-of-2\n");
+			STARPU_ASSERT(0);
+		}
+}
+
+static void
+compute_roots(STARPUFFT(plan) plan)
+{
+	int dim, k;
+
+	/* Compute the n-roots and m-roots of unity for twiddling */
+	for (dim = 0; dim < plan->dim; dim++) {
+		STARPUFFT(complex) exp = (plan->sign * 2. * 4.*atan(1.)) * _Complex_I / (STARPUFFT(complex)) plan->n[dim];
+		plan->roots[dim] = malloc(plan->n[dim] * sizeof(**plan->roots));
+		for (k = 0; k < plan->n[dim]; k++)
+			plan->roots[dim][k] = cexp(exp*k);
+		starpu_register_vector_data(&plan->roots_handle[dim], 0, (uintptr_t) plan->roots[dim], plan->n[dim], sizeof(**plan->roots));
+
+#ifdef USE_CUDA
+		if (plan->n[dim] > 100000) {
+			/* prefetch the big root array on GPUs */
+			unsigned worker;
+			unsigned nworkers = starpu_get_worker_count();
+			for (worker = 0; worker < nworkers; worker++)
+			{
+				unsigned node = starpu_get_worker_memory_node(worker);
+				if (starpu_get_worker_type(worker) == STARPU_CUDA_WORKER)
+					starpu_prefetch_data_on_node(plan->roots_handle[dim], node, 0);
+			}
+		}
+#endif
+	}
+}
+
+#include "starpufftx1d.c"
+#include "starpufftx2d.c"
+
+starpu_tag_t
+STARPUFFT(start)(STARPUFFT(plan) plan, void *_in, void *_out)
+{
+	starpu_tag_t tag;
+	int z;
+
+	plan->in = _in;
+	plan->out = _out;
+
+	switch (plan->dim) {
+		case 1: {
+			switch (plan->type) {
+			case C2C:
+				starpu_register_vector_data(&plan->in_handle, 0, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+				for (z = 0; z < plan->totsize1; z++)
+					plan->twist1_tasks[z]->buffers[0].handle = plan->in_handle;
+				tag = STARPUFFT(start1dC2C)(plan);
+				break;
+			default:
+				STARPU_ASSERT(0);
+				break;
+			}
+			break;
+		}
+		case 2:
+			starpu_register_vector_data(&plan->in_handle, 0, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+			for (z = 0; z < plan->totsize1; z++)
+				plan->twist1_tasks[z]->buffers[0].handle = plan->in_handle;
+			tag = STARPUFFT(start2dC2C)(plan);
+			break;
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+	return tag;
+}
+
+void
+STARPUFFT(cleanup)(STARPUFFT(plan) plan)
+{
+	starpu_delete_data(plan->in_handle);
+}
+
+void
+STARPUFFT(execute)(STARPUFFT(plan) plan, void *in, void *out)
+{
+	memset(task_per_worker, 0, sizeof(task_per_worker));
+	memset(samples_per_worker, 0, sizeof(samples_per_worker));
+
+	gettimeofday(&start, NULL);
+
+	starpu_tag_t tag = STARPUFFT(start)(plan, in, out);
+	gettimeofday(&submit_tasks, NULL);
+	starpu_tag_wait(tag);
+
+	STARPUFFT(cleanup)(plan);
+
+	gettimeofday(&end, NULL);
+}
+
+void
+STARPUFFT(destroy_plan)(STARPUFFT(plan) plan)
+{
+	int workerid, dim, i;
+
+	for (workerid = 0; workerid < starpu_get_worker_count(); workerid++) {
+		switch (starpu_get_worker_type(workerid)) {
+		case STARPU_CORE_WORKER:
+#ifdef HAVE_FFTW
+			_FFTW(free)(plan->plans[workerid].in1);
+			_FFTW(free)(plan->plans[workerid].out1);
+			_FFTW(destroy_plan)(plan->plans[workerid].plan1_cpu);
+			_FFTW(free)(plan->plans[workerid].in2);
+			_FFTW(free)(plan->plans[workerid].out2);
+			_FFTW(destroy_plan)(plan->plans[workerid].plan2_cpu);
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+#ifdef USE_CUDA
+			/* FIXME: Can't deallocate */
+#endif
+			break;
+		default:
+			STARPU_ASSERT(0);
+			break;
+		}
+	}
+	for (i = 0; i < plan->totsize1; i++) {
+		starpu_delete_data(plan->twisted1_handle[i]);
+		free(plan->twist1_tasks[i]);
+		starpu_delete_data(plan->fft1_handle[i]);
+		free(plan->fft1_tasks[i]);
+	}
+
+	free(plan->twisted1_handle);
+	free(plan->twist1_tasks);
+	free(plan->fft1_handle);
+	free(plan->fft1_tasks);
+	free(plan->fft1_args);
+
+	free(plan->join_task);
+
+	for (i = 0; i < plan->totsize3; i++) {
+		starpu_delete_data(plan->twisted2_handle[i]);
+		free(plan->twist2_tasks[i]);
+		starpu_delete_data(plan->fft2_handle[i]);
+		free(plan->fft2_tasks[i]);
+		free(plan->twist3_tasks[i]);
+	}
+
+	free(plan->end_task);
+
+	free(plan->twisted2_handle);
+	free(plan->twist2_tasks);
+	free(plan->fft2_handle);
+	free(plan->fft2_tasks);
+	free(plan->twist3_tasks);
+	free(plan->fft2_args);
+
+	for (dim = 0; dim < plan->dim; dim++) {
+		starpu_delete_data(plan->roots_handle[dim]);
+		free(plan->roots[dim]);
+	}
+
+	switch (plan->dim) {
+		case 1:
+			STARPUFFT(free_1d_tags)(plan);
+			break;
+		case 2:
+			STARPUFFT(free_2d_tags)(plan);
+			break;
+		default:
+			STARPU_ASSERT(0);
+			break;
+	}
+
+	free(plan->n);
+	free(plan->n1);
+	free(plan->n2);
+	STARPUFFT(free)(plan->twisted1);
+	STARPUFFT(free)(plan->fft1);
+	STARPUFFT(free)(plan->twisted2);
+	STARPUFFT(free)(plan->fft2);
+#ifdef HAVE_FFTW
+	_FFTW(destroy_plan)(plan->plan_gather);
+#endif
+	free(plan);
+}
+
+void *
+STARPUFFT(malloc)(size_t n)
+{
+#ifdef USE_CUDA
+	void *res;
+	starpu_malloc_pinned_if_possible(&res, n);
+	return res;
+#else
+#  ifdef HAVE_FFTW
+	return _FFTW(malloc)(n);
+#  else
+	return malloc(n);
+#  endif
+#endif
+}
+
+void
+STARPUFFT(free)(void *p)
+{
+#ifdef USE_CUDA
+	// TODO: FIXME
+#else
+#  ifdef HAVE_FFTW
+	_FFTW(free)(p);
+#  else
+	free(p);
+#  endif
+#endif
+}
+
+void
+STARPUFFT(showstats)(FILE *out)
+{
+	int worker;
+	unsigned total;
+
+#define TIMING(begin,end) (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec))
+#define MSTIMING(begin,end) (TIMING(begin,end)/1000.)
+	double paratiming = TIMING(start,end);
+	fprintf(out, "Tasks submission took %2.2f ms\n", MSTIMING(start,submit_tasks));
+	fprintf(out, "Tasks termination took %2.2f ms\n", MSTIMING(submit_tasks,end));
+
+	fprintf(out, "Total %2.2f ms\n", MSTIMING(start,end));
+
+	for (worker = 0, total = 0; worker < STARPU_NMAXWORKERS; worker++)
+		total += task_per_worker[worker];
+
+	for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
+	{
+		if (task_per_worker[worker])
+		{
+			char name[32];
+			starpu_get_worker_name(worker, name, 32);
+
+			unsigned long bytes = sizeof(STARPUFFT(complex))*samples_per_worker[worker];
+
+			fprintf(out, "\t%s -> %2.2f MB\t%2.2f\tMB/s\t%u %2.2f %%\n", name, (1.0*bytes)/(1024*1024), bytes/paratiming, task_per_worker[worker], (100.0*task_per_worker[worker])/total);
+		}
+	}
+}

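Every task in starpufftx.c is synchronized through explicit tags built by _STEP_TAG: the 64-bit starpu_tag_t packs the plan number into the top NUMBER_BITS (5) bits, the step into the next STEP_BITS (3) bits, and the chunk index into the remaining 56 bits, which is why the code asserts plan->number < 2^5 and the chunk counts fit in the index field. A stand-alone sketch of that packing (the step_tag() helper is hypothetical, for illustration only):

#include <assert.h>
#include <stdint.h>

#define NUMBER_BITS 5
#define NUMBER_SHIFT (64 - NUMBER_BITS)
#define STEP_BITS 3
#define STEP_SHIFT (NUMBER_SHIFT - STEP_BITS)

/* Same layout as _STEP_TAG: plan number | step | chunk index */
static uint64_t step_tag(uint64_t number, uint64_t step, uint64_t i)
{
	return (number << NUMBER_SHIFT) | (step << STEP_SHIFT) | i;
}

int main(void)
{
	uint64_t tag = step_tag(1, 2, 3);	/* plan 1, step FFT1 (2), chunk 3 */

	assert((tag >> NUMBER_SHIFT) == 1);				/* plan number */
	assert(((tag >> STEP_SHIFT) & ((1 << STEP_BITS) - 1)) == 2);	/* step */
	assert((tag & ((1ULL << STEP_SHIFT) - 1)) == 3);		/* chunk index */
	return 0;
}
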
+ 640 - 0
examples/starpufft/starpufftx1d.c

@@ -0,0 +1,640 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define DIV_1D 64
+
+#define STEP_TAG_1D(plan, step, i) _STEP_TAG(plan, step, i)
+
+#ifdef USE_CUDA
+/* Twist the full vector into a n2 chunk */
+static void
+STARPUFFT(twist1_1d_kernel_gpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+
+	_cufftComplex * restrict in = (_cufftComplex *)descr[0].vector.ptr;
+	_cufftComplex * restrict twisted1 = (_cufftComplex *)descr[1].vector.ptr;
+	
+	cudaStream_t stream = STARPUFFT(get_local_stream)(plan, starpu_get_worker_id());
+
+	STARPUFFT(cuda_twist1_1d_host)(in, twisted1, i, n1, n2, stream);
+
+	cudaStreamSynchronize(stream);
+}
+
+/* Perform an n2 fft */
+static void
+STARPUFFT(fft1_1d_kernel_gpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int n2 = plan->n2[0];
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)descr[0].vector.ptr;
+	_cufftComplex * restrict out = (_cufftComplex *)descr[1].vector.ptr;
+	const _cufftComplex * restrict roots = (_cufftComplex *)descr[2].vector.ptr;
+
+	int workerid = starpu_get_worker_id();
+
+	cudaStream_t stream;
+
+	if (!plan->plans[workerid].initialized1) {
+		cures = cufftPlan1d(&plan->plans[workerid].plan1_cuda, n2, _CUFFT_C2C, 1);
+
+		stream = STARPUFFT(get_local_stream)(plan, workerid);
+		cufftSetStream(plan->plans[workerid].plan1_cuda, stream);
+
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		plan->plans[workerid].initialized1 = 1;
+	}
+
+	stream = plan->plans[workerid].stream;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	STARPUFFT(cuda_twiddle_1d_host)(out, roots, n2, i, stream);
+
+	cudaStreamSynchronize(plan->plans[workerid].stream);
+}
+
+static void
+STARPUFFT(fft2_1d_kernel_gpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)descr[0].vector.ptr;
+	_cufftComplex * restrict out = (_cufftComplex *)descr[1].vector.ptr;
+
+	int workerid = starpu_get_worker_id();
+
+	if (!plan->plans[workerid].initialized2) {
+		cures = cufftPlan1d(&plan->plans[workerid].plan2_cuda, n1, _CUFFT_C2C, n3);
+
+		cudaStream_t stream = STARPUFFT(get_local_stream)(plan, workerid);
+		cufftSetStream(plan->plans[workerid].plan2_cuda, stream);
+
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		plan->plans[workerid].initialized2 = 1;
+	}
+
+	/* NOTE using batch support */
+	cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	cudaStreamSynchronize(plan->plans[workerid].stream);
+}
+#endif
+
+/* Twist the full vector into a n2 chunk */
+static void
+STARPUFFT(twist1_1d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)descr[0].vector.ptr;
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)descr[1].vector.ptr;
+
+	//printf("twist1 %d %g\n", i, (double) cabs(plan->in[i]));
+
+	for (j = 0; j < n2; j++)
+		twisted1[j] = in[i+j*n1];
+}
+
+#ifdef HAVE_FFTW
+/* Perform an n2 fft */
+static void
+STARPUFFT(fft1_1d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j;
+	int n2 = plan->n2[0];
+	int workerid = starpu_get_worker_id();
+
+	const STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)descr[0].vector.ptr;
+	STARPUFFT(complex) * restrict fft1 = (STARPUFFT(complex) *)descr[1].vector.ptr;
+
+	_fftw_complex * restrict worker_in1 = (STARPUFFT(complex) *)plan->plans[workerid].in1;
+	_fftw_complex * restrict worker_out1 = (STARPUFFT(complex) *)plan->plans[workerid].out1;
+
+	//printf("fft1 %d %g\n", i, (double) cabs(twisted1[0]));
+
+	memcpy(worker_in1, twisted1, plan->totsize2 * sizeof(*worker_in1));
+	_FFTW(execute)(plan->plans[workerid].plan1_cpu);
+
+	for (j = 0; j < n2; j++)
+		fft1[j] = worker_out1[j] * plan->roots[0][i*j];
+}
+#endif
+
+/* Twist the full vector into a package of n2/DIV_1D (n1) chunks */
+static void
+STARPUFFT(twist2_1d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int jj = args->jj;	/* between 0 and DIV_1D */
+	int jjj;		/* between 0 and n3 */
+	int i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)descr[0].vector.ptr;
+
+	//printf("twist2 %d %g\n", jj, (double) cabs(plan->fft1[jj]));
+
+	for (jjj = 0; jjj < n3; jjj++) {
+		int j = jj * n3 + jjj;
+		for (i = 0; i < n1; i++)
+			twisted2[jjj*n1+i] = plan->fft1[i*n2+j];
+	}
+}
+
+#ifdef HAVE_FFTW
+/* Perform n2/DIV_1D (n1) ffts */
+static void
+STARPUFFT(fft2_1d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	//int jj = args->jj;
+	int workerid = starpu_get_worker_id();
+
+	const STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)descr[0].vector.ptr;
+	STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)descr[1].vector.ptr;
+
+	//printf("fft2 %d %g\n", jj, (double) cabs(twisted2[plan->totsize4-1]));
+
+	_fftw_complex * restrict worker_in2 = (STARPUFFT(complex) *)plan->plans[workerid].in2;
+	_fftw_complex * restrict worker_out2 = (STARPUFFT(complex) *)plan->plans[workerid].out2;
+
+	memcpy(worker_in2, twisted2, plan->totsize4 * sizeof(*worker_in2));
+	_FFTW(execute)(plan->plans[workerid].plan2_cpu);
+	/* no twiddle */
+	memcpy(fft2, worker_out2, plan->totsize4 * sizeof(*worker_out2));
+}
+#endif
+
+/* Spread the package of n2/DIV_1D (n1) chunks into the full vector */
+static void
+STARPUFFT(twist3_1d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int jj = args->jj;	/* between 0 and DIV_1D */
+	int jjj;		/* between 0 and n3 */
+	int i;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int n3 = n2/DIV_1D;
+
+	const STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)descr[0].vector.ptr;
+
+	//printf("twist3 %d %g\n", jj, (double) cabs(fft2[0]));
+
+	for (jjj = 0; jjj < n3; jjj++) {
+		int j = jj * n3 + jjj;
+		for (i = 0; i < n1; i++)
+			plan->out[i*n2+j] = fft2[jjj*n1+i];
+	}
+}
+
+static struct starpu_perfmodel_t STARPUFFT(twist1_1d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"twist1_1d"
+};
+
+static struct starpu_perfmodel_t STARPUFFT(fft1_1d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"fft1_1d"
+};
+
+static struct starpu_perfmodel_t STARPUFFT(twist2_1d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"twist2_1d"
+};
+
+static struct starpu_perfmodel_t STARPUFFT(fft2_1d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"fft2_1d"
+};
+
+static struct starpu_perfmodel_t STARPUFFT(twist3_1d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"twist3_1d"
+};
+
+static starpu_codelet STARPUFFT(twist1_1d_codelet) = {
+	.where =
+#ifdef USE_CUDA
+		CUDA|
+#endif
+		CORE,
+#ifdef USE_CUDA
+	.cuda_func = STARPUFFT(twist1_1d_kernel_gpu),
+#endif
+	.core_func = STARPUFFT(twist1_1d_kernel_cpu),
+	.model = &STARPUFFT(twist1_1d_model),
+	.nbuffers = 2
+};
+
+static starpu_codelet STARPUFFT(fft1_1d_codelet) = {
+	.where =
+#ifdef USE_CUDA
+		CUDA|
+#endif
+#ifdef HAVE_FFTW
+		CORE|
+#endif
+		0,
+#ifdef USE_CUDA
+	.cuda_func = STARPUFFT(fft1_1d_kernel_gpu),
+#endif
+#ifdef HAVE_FFTW
+	.core_func = STARPUFFT(fft1_1d_kernel_cpu),
+#endif
+	.model = &STARPUFFT(fft1_1d_model),
+	.nbuffers = 3
+};
+
+static starpu_codelet STARPUFFT(twist2_1d_codelet) = {
+	.where = CORE,
+	.core_func = STARPUFFT(twist2_1d_kernel_cpu),
+	.model = &STARPUFFT(twist2_1d_model),
+	.nbuffers = 1
+};
+
+static starpu_codelet STARPUFFT(fft2_1d_codelet) = {
+	.where =
+#ifdef USE_CUDA
+		CUDA|
+#endif
+#ifdef HAVE_FFTW
+		CORE|
+#endif
+		0,
+#ifdef USE_CUDA
+	.cuda_func = STARPUFFT(fft2_1d_kernel_gpu),
+#endif
+#ifdef HAVE_FFTW
+	.core_func = STARPUFFT(fft2_1d_kernel_cpu),
+#endif
+	.model = &STARPUFFT(fft2_1d_model),
+	.nbuffers = 2
+};
+
+static starpu_codelet STARPUFFT(twist3_1d_codelet) = {
+	.where = CORE,
+	.core_func = STARPUFFT(twist3_1d_kernel_cpu),
+	.model = &STARPUFFT(twist3_1d_model),
+	.nbuffers = 1
+};
+
+STARPUFFT(plan)
+STARPUFFT(plan_dft_1d)(int n, int sign, unsigned flags)
+{
+	int workerid;
+	int n1 = DIV_1D;
+	int n2 = n / n1;
+	int n3;
+	int z;
+	struct starpu_task *task;
+
+	/*
+	 * Simple strategy:
+	 *
+	 * - twist1: twist input in n1 (n2) chunks
+	 * - fft1:   perform n1 (n2) ffts
+	 * - twist2: twist into n2 (n1) chunks distributed in
+	 *           DIV_1D groups
+	 * - fft2:   perform DIV_1D times n3 (n1) ffts
+	 * - twist3: twist back into output
+	 */
+
+#ifdef USE_CUDA
+	/* cufft 1D limited to 8M elements */
+	while (n2 > 8 << 20) {
+		n1 *= 2;
+		n2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(n == n1*n2);
+	STARPU_ASSERT(n1 < (1ULL << I_BITS));
+
+	/* distribute the n2 second ffts into DIV_1D packages */
+	n3 = n2 / DIV_1D;
+	STARPU_ASSERT(n2 == n3*DIV_1D);
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	STARPUFFT(plan) plan = malloc(sizeof(*plan));
+	memset(plan, 0, sizeof(*plan));
+
+	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
+
+	/* the plan number is limited to NUMBER_BITS bits in the tag space */
+	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
+
+	plan->dim = 1;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	plan->n[0] = n;
+
+	check_dims(plan);
+
+	plan->n1 = malloc(plan->dim * sizeof(*plan->n1));
+	plan->n1[0] = n1;
+	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
+	plan->n2[0] = n2;
+	plan->totsize = n;
+	plan->totsize1 = n1;
+	plan->totsize2 = n2;
+	plan->totsize3 = DIV_1D;
+	plan->totsize4 = plan->totsize / plan->totsize3;
+	plan->type = C2C;
+	plan->sign = sign;
+
+	compute_roots(plan);
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_get_worker_count(); workerid++) {
+		switch (starpu_get_worker_type(workerid)) {
+		case STARPU_CORE_WORKER:
+#ifdef HAVE_FFTW
+			/* first fft plan: one n2 fft */
+			plan->plans[workerid].in1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex));
+			memset(plan->plans[workerid].in1, 0, plan->totsize2 * sizeof(_fftw_complex));
+			plan->plans[workerid].out1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex));
+			memset(plan->plans[workerid].out1, 0, plan->totsize2 * sizeof(_fftw_complex));
+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_1d)(n2, plan->plans[workerid].in1, plan->plans[workerid].out1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
+
+			/* second fft plan: n3 n1 ffts */
+			plan->plans[workerid].in2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex));
+			memset(plan->plans[workerid].in2, 0, plan->totsize4 * sizeof(_fftw_complex));
+			plan->plans[workerid].out2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex));
+			memset(plan->plans[workerid].out2, 0, plan->totsize4 * sizeof(_fftw_complex));
+			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim,
+					plan->n1, n3,
+					/* input */ plan->plans[workerid].in2, NULL, 1, plan->totsize1,
+					/* output */ plan->plans[workerid].out2, NULL, 1, plan->totsize1,
+					sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan2_cpu);
+#else
+#warning libstarpufft cannot work correctly without libfftw3
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+#ifdef USE_CUDA
+			plan->plans[workerid].initialized1 = 0;
+			plan->plans[workerid].initialized2 = 0;
+#endif
+			break;
+		default:
+			STARPU_ASSERT(0);
+			break;
+		}
+	}
+
+	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
+	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
+	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
+	memset(plan->fft1, 0, plan->totsize * sizeof(*plan->fft1));
+	plan->twisted2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted2));
+	memset(plan->twisted2, 0, plan->totsize * sizeof(*plan->twisted2));
+	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
+	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
+
+	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
+	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
+	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
+	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
+
+	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
+	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
+	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
+	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
+	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
+
+	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
+	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
+
+	/* Create first-round tasks */
+	for (z = 0; z < plan->totsize1; z++) {
+		int i = z;
+#define STEP_TAG(step)	STEP_TAG_1D(plan, step, i)
+
+		plan->fft1_args[z].plan = plan;
+		plan->fft1_args[z].i = i;
+
+		/* Register (n2) chunks */
+		starpu_register_vector_data(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		starpu_register_vector_data(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
+
+		/* We'll need it on the CPU for the second twist anyway */
+		starpu_data_set_wb_mask(plan->fft1_handle[z], 1<<0);
+
+		/* Create twist1 task */
+		plan->twist1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist1_1d_codelet);
+		//task->buffers[0].handle = to be filled at execution
+		task->buffers[0].mode = STARPU_R;
+		task->buffers[1].handle = plan->twisted1_handle[z];
+		task->buffers[1].mode = STARPU_W;
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(TWIST1);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that fft1 depends on twisted1 */
+		starpu_tag_declare_deps(STEP_TAG(FFT1),
+				1, STEP_TAG(TWIST1));
+
+		/* Create FFT1 task */
+		plan->fft1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft1_1d_codelet);
+		task->buffers[0].handle = plan->twisted1_handle[z];
+		task->buffers[0].mode = STARPU_R;
+		task->buffers[1].handle = plan->fft1_handle[z];
+		task->buffers[1].mode = STARPU_W;
+		task->buffers[2].handle = plan->roots_handle[0];
+		task->buffers[2].mode = STARPU_R;
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(FFT1);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that to be done with first step we need to have
+		 * finished this fft1 */
+		starpu_tag_declare_deps(STEP_TAG_1D(plan, JOIN, 0),
+				1, STEP_TAG(FFT1));
+#undef STEP_TAG
+	}
+
+	/* Create join task */
+	plan->join_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_1D(plan, JOIN, 0);
+	task->use_tag = 1;
+	task->detach = 1;
+	task->destroy = 0;
+
+	/* Create second-round tasks */
+	for (z = 0; z < plan->totsize3; z++) {
+		int jj = z;
+#define STEP_TAG(step)	STEP_TAG_1D(plan, step, jj)
+
+		plan->fft2_args[z].plan = plan;
+		plan->fft2_args[z].jj = jj;
+
+		/* Register n3 (n1) chunks */
+		starpu_register_vector_data(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
+		starpu_register_vector_data(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
+
+		/* We'll need it on the CPU for the last twist anyway */
+		starpu_data_set_wb_mask(plan->fft2_handle[z], 1<<0);
+
+		/* Tell that twisted2 depends on the whole first step to be
+		 * done */
+		starpu_tag_declare_deps(STEP_TAG(TWIST2),
+				1, STEP_TAG_1D(plan, JOIN, 0));
+
+		/* Create twist2 task */
+		plan->twist2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist2_1d_codelet);
+		task->buffers[0].handle = plan->twisted2_handle[z];
+		task->buffers[0].mode = STARPU_W;
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST2);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that fft2 depends on twisted2 */
+		starpu_tag_declare_deps(STEP_TAG(FFT2),
+				1, STEP_TAG(TWIST2));
+
+		/* Create FFT2 task */
+		plan->fft2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft2_1d_codelet);
+		task->buffers[0].handle = plan->twisted2_handle[z];
+		task->buffers[0].mode = STARPU_R;
+		task->buffers[1].handle = plan->fft2_handle[z];
+		task->buffers[1].mode = STARPU_W;
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(FFT2);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that twist3 depends on fft2 */
+		starpu_tag_declare_deps(STEP_TAG(TWIST3),
+				1, STEP_TAG(FFT2));
+
+		/* Create twist3 tasks */
+		plan->twist3_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist3_1d_codelet);
+		task->buffers[0].handle = plan->fft2_handle[z];
+		task->buffers[0].mode = STARPU_R;
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST3);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that to be completely finished we need to have finished this twisted3 */
+		starpu_tag_declare_deps(STEP_TAG_1D(plan, END, 0),
+				1, STEP_TAG(TWIST3));
+#undef STEP_TAG
+	}
+
+	/* Create end task */
+	plan->end_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_1D(plan, END, 0);
+	task->use_tag = 1;
+	task->detach = 1;
+	task->destroy = 0;
+
+	return plan;
+}
+
+static starpu_tag_t
+STARPUFFT(start1dC2C)(STARPUFFT(plan) plan)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int z;
+
+	for (z=0; z < plan->totsize1; z++) {
+		starpu_submit_task(plan->twist1_tasks[z]);
+		starpu_submit_task(plan->fft1_tasks[z]);
+	}
+
+	starpu_submit_task(plan->join_task);
+
+	for (z=0; z < plan->totsize3; z++) {
+		starpu_submit_task(plan->twist2_tasks[z]);
+		starpu_submit_task(plan->fft2_tasks[z]);
+		starpu_submit_task(plan->twist3_tasks[z]);
+	}
+
+	starpu_submit_task(plan->end_task);
+
+	return STEP_TAG_1D(plan, END, 0);
+}
+
+static void
+STARPUFFT(free_1d_tags)(STARPUFFT(plan) plan)
+{
+	unsigned i;
+	int n1 = plan->n1[0];
+
+	for (i = 0; i < n1; i++) {
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST1, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, FFT1, i));
+	}
+
+	starpu_tag_remove(STEP_TAG_1D(plan, JOIN, 0));
+
+	for (i = 0; i < DIV_1D; i++) {
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST2, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, FFT2, i));
+		starpu_tag_remove(STEP_TAG_1D(plan, TWIST3, i));
+	}
+
+	starpu_tag_remove(STEP_TAG_1D(plan, END, 0));
+}

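The 1-D plan above implements the classic n = n1*n2 Cooley-Tukey split as a task graph: twist1 gathers every n1-th element of the input into a contiguous chunk, fft1 runs an n2-point FFT on it and multiplies by the n-th roots of unity, twist2 regroups the data so that fft2 can run the n2 remaining n1-point FFTs in DIV_1D batches, and twist3 scatters the result back in natural order. A sequential reference sketch of the same computation (illustrative only; a naive O(len^2) DFT stands in for the per-chunk FFTW/CUFFT calls, and the variable-length arrays make it suitable for small sizes only):

#include <complex.h>
#include <math.h>

/* naive O(len^2) DFT, standing in for the per-chunk FFTW/CUFFT calls */
static void dft(int len, int sign, const double complex *x, double complex *y)
{
	double pi = 4. * atan(1.);
	for (int k = 0; k < len; k++) {
		double complex s = 0;
		for (int m = 0; m < len; m++)
			s += x[m] * cexp(sign * 2. * pi * I * k * m / len);
		y[k] = s;
	}
}

/* out[i*n2 + j] = sum_{i'} w_n1^{i*i'} * w_n^{i'*j} * sum_{j'} in[i' + j'*n1] * w_n2^{j*j'} */
void fft_split_1d(int n1, int n2, int sign, const double complex *in, double complex *out)
{
	int n = n1 * n2;
	double pi = 4. * atan(1.);
	double complex twisted1[n2], fft1[n1][n2], col[n1], colout[n1];

	for (int i = 0; i < n1; i++) {
		/* twist1: gather every n1-th element, starting at offset i */
		for (int j = 0; j < n2; j++)
			twisted1[j] = in[i + j * n1];
		/* fft1: n2-point DFT followed by the twiddle by the n-th roots of unity */
		dft(n2, sign, twisted1, fft1[i]);
		for (int j = 0; j < n2; j++)
			fft1[i][j] *= cexp(sign * 2. * pi * I * i * j / n);
	}
	for (int j = 0; j < n2; j++) {
		/* twist2: collect column j across the n1 first-round chunks */
		for (int i = 0; i < n1; i++)
			col[i] = fft1[i][j];
		/* fft2 + twist3: n1-point DFT, scattered back in natural order */
		dft(n1, sign, col, colout);
		for (int i = 0; i < n1; i++)
			out[i * n2 + j] = colout[i];
	}
}
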
+ 708 - 0
examples/starpufft/starpufftx2d.c

@@ -0,0 +1,708 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#define DIV_2D_N 8
+#define DIV_2D_M 8
+
+#define I_SHIFT (I_BITS/2)
+#define J_BITS I_SHIFT
+
+#define STEP_TAG_2D(plan, step, i, j) _STEP_TAG(plan, step, ((starpu_tag_t) i << I_SHIFT) | (starpu_tag_t) j)
+
+#ifdef USE_CUDA
+/* Twist the full vector into a n2,m2 chunk */
+static void
+STARPUFFT(twist1_2d_kernel_gpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+
+	_cufftComplex * restrict in = (_cufftComplex *)descr[0].vector.ptr;
+	_cufftComplex * restrict twisted1 = (_cufftComplex *)descr[1].vector.ptr;
+
+	cudaStream_t stream = STARPUFFT(get_local_stream)(plan, starpu_get_worker_id());
+
+	STARPUFFT(cuda_twist1_2d_host)(in, twisted1, i, j, n1, n2, m1, m2, stream);
+	cudaStreamSynchronize(stream);
+}
+
+/* Perform an n2,m2 fft */
+static void
+STARPUFFT(fft1_2d_kernel_gpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)descr[0].vector.ptr;
+	_cufftComplex * restrict out = (_cufftComplex *)descr[1].vector.ptr;
+	const _cufftComplex * restrict roots0 = (_cufftComplex *)descr[2].vector.ptr;
+	const _cufftComplex * restrict roots1 = (_cufftComplex *)descr[3].vector.ptr;
+
+	int workerid = starpu_get_worker_id();
+
+	cudaStream_t stream;
+
+	if (!plan->plans[workerid].initialized1) {
+		cures = cufftPlan2d(&plan->plans[workerid].plan1_cuda, n2, m2, _CUFFT_C2C);
+
+		stream = STARPUFFT(get_local_stream)(plan, workerid);
+		cufftSetStream(plan->plans[workerid].plan1_cuda, stream);
+
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		plan->plans[workerid].initialized1 = 1;
+	}
+
+	stream = plan->plans[workerid].stream;
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan1_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	STARPU_ASSERT(cures == CUFFT_SUCCESS);
+
+	/* synchronization is done after the twiddling */
+	STARPUFFT(cuda_twiddle_2d_host)(out, roots0, roots1, n2, m2, i, j, stream);
+
+	cudaStreamSynchronize(stream);
+}
+
+static void
+STARPUFFT(fft2_2d_kernel_gpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+	int n;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)descr[0].vector.ptr;
+	_cufftComplex * restrict out = (_cufftComplex *)descr[1].vector.ptr;
+
+	int workerid = starpu_get_worker_id();
+
+	if (!plan->plans[workerid].initialized2) {
+		cures = cufftPlan2d(&plan->plans[workerid].plan2_cuda, n1, m1, _CUFFT_C2C);
+
+		cudaStream_t stream = STARPUFFT(get_local_stream)(plan, workerid);
+		cufftSetStream(plan->plans[workerid].plan2_cuda, stream);
+
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		plan->plans[workerid].initialized2 = 1;
+	}
+
+	for (n = 0; n < n3*m3; n++) {
+		cures = _cufftExecC2C(plan->plans[workerid].plan2_cuda, in + n * n1*m1, out + n * n1*m1, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+	}
+
+	cudaStreamSynchronize(plan->plans[workerid].stream);
+}
+#endif
+
+/* Twist the full vector into a n2,m2 chunk */
+static void
+STARPUFFT(twist1_2d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int k, l;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int m = plan->n[1];
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)descr[0].vector.ptr;
+	STARPUFFT(complex) * restrict twisted1 = (STARPUFFT(complex) *)descr[1].vector.ptr;
+
+	//printf("twist1 %d %d %g\n", i, j, (double) cabs(plan->in[i+j]));
+
+	for (k = 0; k < n2; k++)
+		for (l = 0; l < m2; l++)
+			twisted1[k*m2+l] = in[i*m+j+k*m*n1+l*m1];
+}
+
+#ifdef HAVE_FFTW
+/* Perform an n2,m2 fft */
+static void
+STARPUFFT(fft1_2d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int i = args->i;
+	int j = args->j;
+	int k, l;
+	int n2 = plan->n2[0];
+	int m2 = plan->n2[1];
+	int workerid = starpu_get_worker_id();
+
+	const STARPUFFT(complex) *twisted1 = (STARPUFFT(complex) *)descr[0].vector.ptr;
+	STARPUFFT(complex) *fft1 = (STARPUFFT(complex) *)descr[1].vector.ptr;
+
+	_fftw_complex * restrict worker_in1 = (STARPUFFT(complex) *)plan->plans[workerid].in1;
+	_fftw_complex * restrict worker_out1 = (STARPUFFT(complex) *)plan->plans[workerid].out1;
+
+	//printf("fft1 %d %d %g\n", i, j, (double) cabs(twisted1[0]));
+
+	memcpy(worker_in1, twisted1, plan->totsize2 * sizeof(*worker_in1));
+	_FFTW(execute)(plan->plans[workerid].plan1_cpu);
+	for (k = 0; k < n2; k++)
+		for (l = 0; l < m2; l++)
+			fft1[k*m2 + l] = worker_out1[k*m2 + l] * plan->roots[0][i*k] * plan->roots[1][j*l];
+}
+#endif
+
+/* Twist the full vector into a package of n2/DIV_2D_N,m2/DIV_2D_M (n1,m1) chunks */
+static void
+STARPUFFT(twist2_2d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int kk = args->kk;	/* between 0 and DIV_2D_N */
+	int ll = args->ll;	/* between 0 and DIV_2D_M */
+	int kkk, lll;		/* between 0,0 and n3,m3 */
+	int i, j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+
+	STARPUFFT(complex) * restrict twisted2 = (STARPUFFT(complex) *)descr[0].vector.ptr;
+
+	//printf("twist2 %d %d %g\n", kk, ll, (double) cabs(plan->fft1[kk+ll]));
+
+	for (kkk = 0; kkk < n3; kkk++) {
+		int k = kk * n3 + kkk;
+		for (lll = 0; lll < m3; lll++) {
+			int l = ll * m3 + lll;
+			for (i = 0; i < n1; i++)
+				for (j = 0; j < m1; j++)
+					twisted2[kkk*m3*n1*m1+lll*n1*m1+i*m1+j] = plan->fft1[i*n1*n2*m2+j*n2*m2+k*m2+l];
+		}
+	}
+}
+
+#ifdef HAVE_FFTW
+/* Perform (n2/DIV_2D_N)*(m2/DIV_2D_M) (n1,m1) ffts */
+static void
+STARPUFFT(fft2_2d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	//int kk = args->kk;
+	//int ll = args->ll;
+	int workerid = starpu_get_worker_id();
+
+	const STARPUFFT(complex) *twisted2 = (STARPUFFT(complex) *)descr[0].vector.ptr;
+	STARPUFFT(complex) *fft2 = (STARPUFFT(complex) *)descr[1].vector.ptr;
+
+	//printf("fft2 %d %d %g\n", kk, ll, (double) cabs(twisted2[plan->totsize4-1]));
+
+	_fftw_complex * restrict worker_in2 = (STARPUFFT(complex) *)plan->plans[workerid].in2;
+	_fftw_complex * restrict worker_out2 = (STARPUFFT(complex) *)plan->plans[workerid].out2;
+
+	memcpy(worker_in2, twisted2, plan->totsize4 * sizeof(*worker_in2));
+	_FFTW(execute)(plan->plans[workerid].plan2_cpu);
+	/* no twiddle */
+	memcpy(fft2, worker_out2, plan->totsize4 * sizeof(*worker_out2));
+}
+#endif
+
+/* Spread the package of (n2/DIV_2D_N)*(m2/DIV_2D_M) (n1,m1) chunks into the full vector */
+static void
+STARPUFFT(twist3_2d_kernel_cpu)(starpu_data_interface_t *descr, void *_args)
+{
+	struct STARPUFFT(args) *args = _args;
+	STARPUFFT(plan) plan = args->plan;
+	int kk = args->kk;	/* between 0 and DIV_2D_N */
+	int ll = args->ll;	/* between 0 and DIV_2D_M */
+	int kkk, lll;		/* between 0,0 and n3,m3 */
+	int i, j;
+	int n1 = plan->n1[0];
+	int n2 = plan->n2[0];
+	int m1 = plan->n1[1];
+	int m2 = plan->n2[1];
+	int n3 = n2/DIV_2D_N;
+	int m3 = m2/DIV_2D_M;
+	int m = plan->n[1];
+
+	const STARPUFFT(complex) * restrict fft2 = (STARPUFFT(complex) *)descr[0].vector.ptr;
+
+	//printf("twist3 %d %d %g\n", kk, ll, (double) cabs(fft2[0]));
+
+	for (kkk = 0; kkk < n3; kkk++) {
+		int k = kk * n3 + kkk;
+		for (lll = 0; lll < m3; lll++) {
+			int l = ll * m3 + lll;
+			for (i = 0; i < n1; i++)
+				for (j = 0; j < m1; j++)
+					plan->out[i*n2*m+j*m2+k*m+l] = fft2[kkk*m3*n1*m1+lll*n1*m1+i*m1+j];
+		}
+	}
+}
+
+struct starpu_perfmodel_t STARPUFFT(twist1_2d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"twist1_2d"
+};
+
+struct starpu_perfmodel_t STARPUFFT(fft1_2d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"fft1_2d"
+};
+
+struct starpu_perfmodel_t STARPUFFT(twist2_2d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"twist2_2d"
+};
+
+struct starpu_perfmodel_t STARPUFFT(fft2_2d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"fft2_2d"
+};
+
+struct starpu_perfmodel_t STARPUFFT(twist3_2d_model) = {
+	.type = HISTORY_BASED,
+	.symbol = TYPE"twist3_2d"
+};
+
+static starpu_codelet STARPUFFT(twist1_2d_codelet) = {
+	.where =
+#ifdef USE_CUDA
+		CUDA|
+#endif
+		CORE,
+#ifdef USE_CUDA
+	.cuda_func = STARPUFFT(twist1_2d_kernel_gpu),
+#endif
+	.core_func = STARPUFFT(twist1_2d_kernel_cpu),
+	.model = &STARPUFFT(twist1_2d_model),
+	.nbuffers = 2
+};
+
+static starpu_codelet STARPUFFT(fft1_2d_codelet) = {
+	.where =
+#ifdef USE_CUDA
+		CUDA|
+#endif
+#ifdef HAVE_FFTW
+		CORE|
+#endif
+		0,
+#ifdef USE_CUDA
+	.cuda_func = STARPUFFT(fft1_2d_kernel_gpu),
+#endif
+#ifdef HAVE_FFTW
+	.core_func = STARPUFFT(fft1_2d_kernel_cpu),
+#endif
+	.model = &STARPUFFT(fft1_2d_model),
+	.nbuffers = 4
+};
+
+static starpu_codelet STARPUFFT(twist2_2d_codelet) = {
+	.where = CORE,
+	.core_func = STARPUFFT(twist2_2d_kernel_cpu),
+	.model = &STARPUFFT(twist2_2d_model),
+	.nbuffers = 1
+};
+
+static starpu_codelet STARPUFFT(fft2_2d_codelet) = {
+	.where =
+#ifdef USE_CUDA
+		CUDA|
+#endif
+#ifdef HAVE_FFTW
+		CORE|
+#endif
+		0,
+#ifdef USE_CUDA
+	.cuda_func = STARPUFFT(fft2_2d_kernel_gpu),
+#endif
+#ifdef HAVE_FFTW
+	.core_func = STARPUFFT(fft2_2d_kernel_cpu),
+#endif
+	.model = &STARPUFFT(fft2_2d_model),
+	.nbuffers = 2
+};
+
+static starpu_codelet STARPUFFT(twist3_2d_codelet) = {
+	.where = CORE,
+	.core_func = STARPUFFT(twist3_2d_kernel_cpu),
+	.model = &STARPUFFT(twist3_2d_model),
+	.nbuffers = 1
+};
+
+STARPUFFT(plan)
+STARPUFFT(plan_dft_2d)(int n, int m, int sign, unsigned flags)
+{
+	int workerid;
+	int n1 = DIV_2D_N;
+	int n2 = n / n1;
+	int n3;
+	int m1 = DIV_2D_M;
+	int m2 = m / m1;
+	int m3;
+	int z;
+	struct starpu_task *task;
+
+	/*
+	 * Simple strategy:
+	 *
+	 * - twist1: twist input in n1*m1 (n2,m2) chunks
+	 * - fft1:   perform n1*m1 (n2,m2) ffts
+	 * - twist2: twist into n2*m2 (n1,m1) chunks distributed in
+	 *           DIV_2D_N*DIV_2D_M groups
+	 * - fft2:   perform DIV_2D_N*DIV_2D_M times n3*m3 (n1,m1) ffts
+	 * - twist3: twist back into output
+	 */
+
+#ifdef USE_CUDA
+	/* cufft 2D-3D limited to [2,16384] */
+	while (n2 > 16384) {
+		n1 *= 2;
+		n2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(n == n1*n2);
+	STARPU_ASSERT(n1 < (1ULL << J_BITS));
+
+
+#ifdef USE_CUDA
+	/* cufft 2D-3D limited to [2,16384] */
+	while (m2 > 16384) {
+		m1 *= 2;
+		m2 /= 2;
+	}
+#endif
+	STARPU_ASSERT(m == m1*m2);
+	STARPU_ASSERT(m1 < (1ULL << J_BITS));
+
+	/* distribute the n2*m2 second ffts into DIV_2D_N*DIV_2D_M packages */
+	n3 = n2 / DIV_2D_N;
+	STARPU_ASSERT(n2 == n3*DIV_2D_N);
+	m3 = m2 / DIV_2D_M;
+	STARPU_ASSERT(m2 == m3*DIV_2D_M);
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	STARPUFFT(plan) plan = malloc(sizeof(*plan));
+	memset(plan, 0, sizeof(*plan));
+
+	plan->number = STARPU_ATOMIC_ADD(&starpufft_last_plan_number, 1) - 1;
+
+	/* the plan number is limited to NUMBER_BITS bits in the tag space */
+	STARPU_ASSERT(plan->number < (1ULL << NUMBER_BITS));
+
+	plan->dim = 2;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	plan->n[0] = n;
+	plan->n[1] = m;
+
+	check_dims(plan);
+
+	plan->n1 = malloc(plan->dim * sizeof(*plan->n1));
+	plan->n1[0] = n1;
+	plan->n1[1] = m1;
+	plan->n2 = malloc(plan->dim * sizeof(*plan->n2));
+	plan->n2[0] = n2;
+	plan->n2[1] = m2;
+	plan->totsize = n * m;
+	plan->totsize1 = n1 * m1;
+	plan->totsize2 = n2 * m2;
+	plan->totsize3 = DIV_2D_N * DIV_2D_M;
+	plan->totsize4 = plan->totsize / plan->totsize3;
+	plan->type = C2C;
+	plan->sign = sign;
+
+	compute_roots(plan);
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_get_worker_count(); workerid++) {
+		switch (starpu_get_worker_type(workerid)) {
+		case STARPU_CORE_WORKER:
+#ifdef HAVE_FFTW
+			/* first fft plan: one n2*m2 fft */
+			plan->plans[workerid].in1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex));
+			memset(plan->plans[workerid].in1, 0, plan->totsize2 * sizeof(_fftw_complex));
+			plan->plans[workerid].out1 = _FFTW(malloc)(plan->totsize2 * sizeof(_fftw_complex));
+			memset(plan->plans[workerid].out1, 0, plan->totsize2 * sizeof(_fftw_complex));
+			plan->plans[workerid].plan1_cpu = _FFTW(plan_dft_2d)(n2, m2, plan->plans[workerid].in1, plan->plans[workerid].out1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan1_cpu);
+
+			/* second fft plan: n3*m3 n1*m1 ffts */
+			plan->plans[workerid].in2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex));
+			memset(plan->plans[workerid].in2, 0, plan->totsize4 * sizeof(_fftw_complex));
+			plan->plans[workerid].out2 = _FFTW(malloc)(plan->totsize4 * sizeof(_fftw_complex));
+			memset(plan->plans[workerid].out2, 0, plan->totsize4 * sizeof(_fftw_complex));
+			plan->plans[workerid].plan2_cpu = _FFTW(plan_many_dft)(plan->dim,
+					plan->n1, n3*m3,
+					/* input */ plan->plans[workerid].in2, NULL, 1, plan->totsize1,
+					/* output */ plan->plans[workerid].out2, NULL, 1, plan->totsize1,
+					sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan2_cpu);
+#else
+#warning libstarpufft cannot work correctly without libfftw3
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+#ifdef USE_CUDA
+			plan->plans[workerid].initialized1 = 0;
+			plan->plans[workerid].initialized2 = 0;
+#endif
+			break;
+		default:
+			STARPU_ASSERT(0);
+			break;
+		}
+	}
+
+	plan->twisted1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted1));
+	memset(plan->twisted1, 0, plan->totsize * sizeof(*plan->twisted1));
+	plan->fft1 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft1));
+	memset(plan->fft1, 0, plan->totsize * sizeof(*plan->fft1));
+	plan->twisted2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->twisted2));
+	memset(plan->twisted2, 0, plan->totsize * sizeof(*plan->twisted2));
+	plan->fft2 = STARPUFFT(malloc)(plan->totsize * sizeof(*plan->fft2));
+	memset(plan->fft2, 0, plan->totsize * sizeof(*plan->fft2));
+
+	plan->twisted1_handle = malloc(plan->totsize1 * sizeof(*plan->twisted1_handle));
+	plan->fft1_handle = malloc(plan->totsize1 * sizeof(*plan->fft1_handle));
+	plan->twisted2_handle = malloc(plan->totsize3 * sizeof(*plan->twisted2_handle));
+	plan->fft2_handle = malloc(plan->totsize3 * sizeof(*plan->fft2_handle));
+
+	plan->twist1_tasks = malloc(plan->totsize1 * sizeof(*plan->twist1_tasks));
+	plan->fft1_tasks = malloc(plan->totsize1 * sizeof(*plan->fft1_tasks));
+	plan->twist2_tasks = malloc(plan->totsize3 * sizeof(*plan->twist2_tasks));
+	plan->fft2_tasks = malloc(plan->totsize3 * sizeof(*plan->fft2_tasks));
+	plan->twist3_tasks = malloc(plan->totsize3 * sizeof(*plan->twist3_tasks));
+
+	plan->fft1_args = malloc(plan->totsize1 * sizeof(*plan->fft1_args));
+	plan->fft2_args = malloc(plan->totsize3 * sizeof(*plan->fft2_args));
+
+	/* Create first-round tasks */
+	for (z = 0; z < plan->totsize1; z++) {
+		int i = z / m1, j = z % m1;
+#define STEP_TAG(step)	STEP_TAG_2D(plan, step, i, j)
+
+		plan->fft1_args[z].plan = plan;
+		plan->fft1_args[z].i = i;
+		plan->fft1_args[z].j = j;
+
+		/* Register (n2,m2) chunks */
+		starpu_register_vector_data(&plan->twisted1_handle[z], 0, (uintptr_t) &plan->twisted1[z*plan->totsize2], plan->totsize2, sizeof(*plan->twisted1));
+		starpu_register_vector_data(&plan->fft1_handle[z], 0, (uintptr_t) &plan->fft1[z*plan->totsize2], plan->totsize2, sizeof(*plan->fft1));
+
+		/* We'll need it on the CPU for the second twist anyway */
+		starpu_data_set_wb_mask(plan->fft1_handle[z], 1<<0);
+
+		/* Create twist1 task */
+		plan->twist1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist1_2d_codelet);
+		//task->buffers[0].handle = to be filled at execution
+		task->buffers[0].mode = STARPU_R;
+		task->buffers[1].handle = plan->twisted1_handle[z];
+		task->buffers[1].mode = STARPU_W;
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(TWIST1);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that fft1 depends on twisted1 */
+		starpu_tag_declare_deps(STEP_TAG(FFT1),
+				1, STEP_TAG(TWIST1));
+
+		/* Create FFT1 task */
+		plan->fft1_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft1_2d_codelet);
+		task->buffers[0].handle = plan->twisted1_handle[z];
+		task->buffers[0].mode = STARPU_R;
+		task->buffers[1].handle = plan->fft1_handle[z];
+		task->buffers[1].mode = STARPU_W;
+		task->buffers[2].handle = plan->roots_handle[0];
+		task->buffers[2].mode = STARPU_R;
+		task->buffers[3].handle = plan->roots_handle[1];
+		task->buffers[3].mode = STARPU_R;
+		task->cl_arg = &plan->fft1_args[z];
+		task->tag_id = STEP_TAG(FFT1);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that to be done with first step we need to have
+		 * finished this fft1 */
+		starpu_tag_declare_deps(STEP_TAG_2D(plan, JOIN, 0, 0),
+				1, STEP_TAG(FFT1));
+#undef STEP_TAG
+	}
+
+	/* Create join task */
+	plan->join_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_2D(plan, JOIN, 0, 0);
+	task->use_tag = 1;
+	task->detach = 1;
+	task->destroy = 0;
+
+	/* Create second-round tasks */
+	for (z = 0; z < plan->totsize3; z++) {
+		int kk = z / DIV_2D_M, ll = z % DIV_2D_M;
+#define STEP_TAG(step)	STEP_TAG_2D(plan, step, kk, ll)
+
+		plan->fft2_args[z].plan = plan;
+		plan->fft2_args[z].kk = kk;
+		plan->fft2_args[z].ll = ll;
+
+		/* Register n3*m3 (n1,m1) chunks */
+		starpu_register_vector_data(&plan->twisted2_handle[z], 0, (uintptr_t) &plan->twisted2[z*plan->totsize4], plan->totsize4, sizeof(*plan->twisted2));
+		starpu_register_vector_data(&plan->fft2_handle[z], 0, (uintptr_t) &plan->fft2[z*plan->totsize4], plan->totsize4, sizeof(*plan->fft2));
+
+		/* We'll need it on the CPU for the last twist anyway */
+		starpu_data_set_wb_mask(plan->fft2_handle[z], 1<<0);
+
+		/* Tell that twisted2 depends on the whole first step to be
+		 * done */
+		starpu_tag_declare_deps(STEP_TAG(TWIST2),
+				1, STEP_TAG_2D(plan, JOIN, 0, 0));
+
+		/* Create twist2 task */
+		plan->twist2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist2_2d_codelet);
+		task->buffers[0].handle = plan->twisted2_handle[z];
+		task->buffers[0].mode = STARPU_W;
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST2);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that fft2 depends on twisted2 */
+		starpu_tag_declare_deps(STEP_TAG(FFT2),
+				1, STEP_TAG(TWIST2));
+
+		/* Create FFT2 task */
+		plan->fft2_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(fft2_2d_codelet);
+		task->buffers[0].handle = plan->twisted2_handle[z];
+		task->buffers[0].mode = STARPU_R;
+		task->buffers[1].handle = plan->fft2_handle[z];
+		task->buffers[1].mode = STARPU_W;
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(FFT2);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that twist3 depends on fft2 */
+		starpu_tag_declare_deps(STEP_TAG(TWIST3),
+				1, STEP_TAG(FFT2));
+
+		/* Create twist3 tasks */
+		plan->twist3_tasks[z] = task = starpu_task_create();
+		task->cl = &STARPUFFT(twist3_2d_codelet);
+		task->buffers[0].handle = plan->fft2_handle[z];
+		task->buffers[0].mode = STARPU_R;
+		task->cl_arg = &plan->fft2_args[z];
+		task->tag_id = STEP_TAG(TWIST3);
+		task->use_tag = 1;
+		task->detach = 1;
+		task->destroy = 0;
+
+		/* Tell that the whole plan (END) is only finished once this twist3 is done */
+		starpu_tag_declare_deps(STEP_TAG_2D(plan, END, 0, 0),
+				1, STEP_TAG(TWIST3));
+#undef STEP_TAG
+	}
+
+	/* Create end task */
+	plan->end_task = task = starpu_task_create();
+	task->cl = NULL;
+	task->tag_id = STEP_TAG_2D(plan, END, 0, 0);
+	task->use_tag = 1;
+	task->detach = 1;
+	task->destroy = 0;
+
+	return plan;
+}
+
+static starpu_tag_t
+STARPUFFT(start2dC2C)(STARPUFFT(plan) plan)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int z;
+
+	for (z=0; z < plan->totsize1; z++) {
+		starpu_submit_task(plan->twist1_tasks[z]);
+		starpu_submit_task(plan->fft1_tasks[z]);
+	}
+
+	starpu_submit_task(plan->join_task);
+
+	for (z=0; z < plan->totsize3; z++) {
+		starpu_submit_task(plan->twist2_tasks[z]);
+		starpu_submit_task(plan->fft2_tasks[z]);
+		starpu_submit_task(plan->twist3_tasks[z]);
+	}
+
+	starpu_submit_task(plan->end_task);
+
+	return STEP_TAG_2D(plan, END, 0, 0);
+}
+
+static void
+STARPUFFT(free_2d_tags)(STARPUFFT(plan) plan)
+{
+	unsigned i, j;
+	int n1 = plan->n1[0];
+	int m1 = plan->n1[1];
+
+	for (i = 0; i < n1; i++) {
+		for (j = 0; j < m1; j++) {
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST1, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, FFT1, i, j));
+		}
+	}
+
+	starpu_tag_remove(STEP_TAG_2D(plan, JOIN, 0, 0));
+
+	for (i = 0; i < DIV_2D_N; i++) {
+		for (j = 0; j < DIV_2D_M; j++) {
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST2, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, FFT2, i, j));
+			starpu_tag_remove(STEP_TAG_2D(plan, TWIST3, i, j));
+		}
+	}
+
+	starpu_tag_remove(STEP_TAG_2D(plan, END, 0, 0));
+}
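
The plan construction above relies on a single pattern: every step is a detached, non-destroyed task carrying a tag, and ordering is expressed solely through starpu_tag_declare_deps before anything is submitted. The following is a minimal sketch (not part of this commit) of that pattern, assuming only the task and tag API declared in include/starpu-task.h further down in this change; the no-op codelet and the tag values 0x1/0x2 are hypothetical.

#include <starpu.h>

static void noop(starpu_data_interface_t *buffers __attribute__((unused)),
		void *arg __attribute__((unused)))
{
}

static starpu_codelet cl = {
	.where = CORE,
	.core_func = noop,
	.nbuffers = 0
};

int main(void)
{
	starpu_init(NULL);

	struct starpu_task *first = starpu_task_create();
	first->cl = &cl;
	first->use_tag = 1;
	first->tag_id = (starpu_tag_t)0x1;
	first->detach = 1;
	first->destroy = 0;

	struct starpu_task *second = starpu_task_create();
	second->cl = &cl;
	second->use_tag = 1;
	second->tag_id = (starpu_tag_t)0x2;
	second->detach = 1;
	second->destroy = 0;

	/* the task wearing tag 0x2 only starts once tag 0x1 is complete */
	starpu_tag_declare_deps((starpu_tag_t)0x2, 1, (starpu_tag_t)0x1);

	/* submission order does not matter, the tags do the sequencing */
	starpu_submit_task(second);
	starpu_submit_task(first);

	starpu_tag_wait((starpu_tag_t)0x2);

	/* tags and non-destroyed tasks must be released explicitly */
	starpu_tag_remove((starpu_tag_t)0x1);
	starpu_tag_remove((starpu_tag_t)0x2);
	starpu_task_destroy(first);
	starpu_task_destroy(second);

	starpu_shutdown();
	return 0;
}

Because destroy stays at 0, such task structures can be resubmitted before being freed, which is the same property tag_restartable.c below depends on.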

+ 18 - 0
examples/starpufft/test.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "testx.c"

+ 18 - 0
examples/starpufft/test_threads.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "double.h"
+#include "testx_threads.c"

+ 18 - 0
examples/starpufft/testf.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "testx.c"

+ 18 - 0
examples/starpufft/testf_threads.c

@@ -0,0 +1,18 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "float.h"
+#include "testx_threads.c"

+ 228 - 0
examples/starpufft/testx.c

@@ -0,0 +1,228 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+
+#include <starpu_config.h>
+#include "starpufft.h"
+
+#undef USE_CUDA
+
+#ifdef HAVE_FFTW
+#include <fftw3.h>
+#endif
+#ifdef USE_CUDA
+#include <cufft.h>
+#endif
+
+#define SIGN (-1)
+//#define SIGN (1)
+
+int main(int argc, char *argv[]) {
+	int i;
+	struct timeval begin, end;
+	int size;
+	size_t bytes;
+	int n = 0, m = 0;
+	STARPUFFT(plan) plan;
+#ifdef HAVE_FFTW
+	_FFTW(plan) fftw_plan;
+#endif
+#ifdef USE_CUDA
+	cufftHandle cuda_plan;
+	cudaError_t cures;
+#endif
+	double timing;
+
+	if (argc < 2 || argc > 3) {
+		fprintf(stderr,"need one or two size of vector\n");
+		exit(EXIT_FAILURE);
+	}
+
+	starpu_init(NULL);
+
+	if (argc == 2) {
+		n = atoi(argv[1]);
+
+		/* 1D */
+		size = n;
+	} else if (argc == 3) {
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+
+		/* 2D */
+		size = n * m;
+	} else {
+		assert(0);
+	}
+
+	bytes = size * sizeof(STARPUFFT(complex));
+
+	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
+	srand48(0);
+	for (i = 0; i < size; i++)
+		in[i] = drand48() + I * drand48();
+
+	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));
+
+#ifdef HAVE_FFTW
+	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
+#endif
+
+#ifdef USE_CUDA
+	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
+#endif
+
+	if (argc == 2) {
+		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
+#ifdef HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef USE_CUDA
+		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
+			printf("erf\n");
+#endif
+
+	} else if (argc == 3) {
+		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
+#ifdef HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef USE_CUDA
+		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
+#endif
+	} else {
+		assert(0);
+	}
+
+#ifdef HAVE_FFTW
+	gettimeofday(&begin, NULL);
+	_FFTW(execute)(fftw_plan);
+	gettimeofday(&end, NULL);
+	_FFTW(destroy_plan)(fftw_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
+#endif
+#ifdef USE_CUDA
+	gettimeofday(&begin, NULL);
+	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
+		printf("erf2\n");
+	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
+		CUDA_REPORT_ERROR(cures);
+	gettimeofday(&end, NULL);
+	cufftDestroy(cuda_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
+#endif
+
+	STARPUFFT(execute)(plan, in, out);
+
+	STARPUFFT(showstats)(stdout);
+	STARPUFFT(destroy_plan)(plan);
+
+	starpu_shutdown();
+
+	printf("\n");
+#if 0
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
+	printf("\n\n");
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
+	printf("\n\n");
+#ifdef HAVE_FFTW
+	for (i = 0; i < 16; i++)
+		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
+	printf("\n\n");
+#endif
+#endif
+
+#ifdef HAVE_FFTW
+{
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++) {
+		double diff = cabs(out[i]-out_fftw[i]);
+		double diff2 = diff * diff;
+		double size = cabs(out_fftw[i]);
+		double size2 = size * size;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += size2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
+		return EXIT_FAILURE;
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+		return EXIT_FAILURE;
+}
+#endif
+
+#ifdef USE_CUDA
+{
+	double max = 0., tot = 0., norm = 0., normdiff = 0.;
+	for (i = 0; i < size; i++) {
+		double diff = cabs(out_cuda[i]-out_fftw[i]);
+		double diff2 = diff * diff;
+		double size = cabs(out_fftw[i]);
+		double size2 = size * size;
+		if (diff > max)
+			max = diff;
+		tot += diff;
+		normdiff += diff2;
+		norm += size2;
+	}
+	fprintf(stderr, "\nmaximum difference %g\n", max);
+	fprintf(stderr, "average difference %g\n", tot / size);
+	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
+	double relmaxdiff = max / sqrt(norm);
+	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
+	double relavgdiff = (tot / size) / sqrt(norm);
+	fprintf(stderr, "relative average difference %g\n", relavgdiff);
+	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
+		return EXIT_FAILURE;
+	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
+		return EXIT_FAILURE;
+}
+#endif
+
+	STARPUFFT(free)(in);
+	STARPUFFT(free)(out);
+
+#ifdef HAVE_FFTW
+	STARPUFFT(free)(out_fftw);
+#endif
+
+#ifdef USE_CUDA
+	free(out_cuda);
+#endif
+
+	return EXIT_SUCCESS;
+}

+ 100 - 0
examples/starpufft/testx_threads.c

@@ -0,0 +1,100 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <complex.h>
+#include <math.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+
+#include <starpu_config.h>
+#include "starpufft.h"
+
+#include <fftw3.h>
+
+#define SIGN (-1)
+//#define SIGN (1)
+
+int main(int argc, char *argv[]) {
+	int i;
+	struct timeval begin, end;
+	int size;
+	size_t bytes;
+	int n = 0, m = 0;
+	_FFTW(plan) fftw_plan;
+	double timing;
+	char *num;
+	int num_threads = 1;
+
+	_FFTW(init_threads)();
+
+	num = getenv("NUM_THREADS");
+	if (num)
+		num_threads = atoi(num);
+	_FFTW(plan_with_nthreads)(num_threads);
+
+	if (argc < 2 || argc > 3) {
+		fprintf(stderr,"need one or two size of vector\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (argc == 2) {
+		n = atoi(argv[1]);
+
+		/* 1D */
+		size = n;
+	} else if (argc == 3) {
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+
+		/* 2D */
+		size = n * m;
+	} else {
+		assert(0);
+	}
+
+	bytes = size * sizeof(_FFTW(complex));
+
+	_FFTW(complex) *in = _FFTW(malloc)(size * sizeof(*in));
+	srand48(0);
+	for (i = 0; i < size; i++)
+		in[i] = drand48() + I * drand48();
+
+	_FFTW(complex) *out_fftw = _FFTW(malloc)(size * sizeof(*out_fftw));
+
+	if (argc == 2) {
+		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
+
+	} else if (argc == 3) {
+		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
+	} else {
+		assert(0);
+	}
+
+	gettimeofday(&begin, NULL);
+	_FFTW(execute)(fftw_plan);
+	gettimeofday(&end, NULL);
+	_FFTW(destroy_plan)(fftw_plan);
+	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
+	printf("FFTW with %d threads took %2.2f ms (%2.2f MB/s)\n\n", num_threads, timing/1000, bytes/(timing*num_threads));
+
+	printf("\n");
+
+	return EXIT_SUCCESS;
+}

+ 10 - 10
examples/strassen/strassen.c

@@ -89,51 +89,51 @@ static void unpartition_matrices(strassen_iter_state_t *iter)
 }
 
 static starpu_codelet cl_add = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_add_sub,
 	.core_func = add_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = add_cublas_codelet,
+	.cuda_func = add_cublas_codelet,
 #endif
 	.nbuffers = 3
 };
 
 static starpu_codelet cl_sub = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_add_sub,
 	.core_func = sub_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = sub_cublas_codelet,
+	.cuda_func = sub_cublas_codelet,
 #endif
 	.nbuffers = 3
 };
 
 static starpu_codelet cl_mult = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_mult,
 	.core_func = mult_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = mult_cublas_codelet,
+	.cuda_func = mult_cublas_codelet,
 #endif
 	.nbuffers = 3
 };
 
 static starpu_codelet cl_self_add = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_self_add_sub,
 	.core_func = self_add_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = self_add_cublas_codelet,
+	.cuda_func = self_add_cublas_codelet,
 #endif
 	.nbuffers = 2
 };
 
 static starpu_codelet cl_self_sub = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_self_add_sub,
 	.core_func = self_sub_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = self_sub_cublas_codelet,
+	.cuda_func = self_sub_cublas_codelet,
 #endif
 	.nbuffers = 2
 };

+ 6 - 0
examples/strassen/strassen_kernels.c

@@ -42,6 +42,7 @@ static void mult_common_codelet(starpu_data_interface_t *buffers, int s, __attri
 			cublasSgemm('t', 'n', dx, dy, dz, 
 					-1.0f, right, ld12, left, ld21, 
 					 1.0f, center, ld22);
+			cudaThreadSynchronize();
 			break;
 #endif
 		default:
@@ -100,6 +101,8 @@ static void add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __at
 				/* add line B to C = A */
 				cublasSaxpy(dx, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
 			}
+			
+			cudaThreadSynchronize();
 
 			break;
 #endif
@@ -164,6 +167,9 @@ static void self_add_sub_common_codelet(starpu_data_interface_t *buffers, int s,
 				/* add line A to C */
 				cublasSaxpy(dx, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
 			}
+			
+			cudaThreadSynchronize();
+
 			break;
 #endif
 		default:

+ 4 - 0
examples/strassen/test_strassen.c

@@ -176,12 +176,16 @@ int main(__attribute__ ((unused)) int argc,
 	/* start the runtime */
 	starpu_init(NULL);
 
+	starpu_helper_init_cublas();
+
 	sem_init(&sem, 0, 0U);
 
 	init_problem();
 	sem_wait(&sem);
 	sem_destroy(&sem);
 
+	starpu_helper_shutdown_cublas();
+
 	starpu_shutdown();
 
 	return 0;

+ 64 - 51
examples/strassen2/strassen2.c

@@ -24,12 +24,24 @@
 
 #include <starpu.h>
 
-
 #define MAXDEPS	4
 
 uint64_t current_tag = 1024;
 
 uint64_t used_mem = 0;
+uint64_t used_mem_predicted = 0;
+
+#define MAXREC	7
+
+/* the size consumed by the algorithm should be
+ *	<= (size)^2 * ( predicted_mem[rec] + 1)
+ * NB: we don't really need this, but it is useful to avoid allocating
+ * thousands of pinned buffers and as many VMAs, which pressure Linux a lot */
+static unsigned predicted_mem[7] = {
+	12, 29, 58, 110, 201, 361, 640
+};
+
+static unsigned char *bigbuffer;
 
 /*
 
@@ -151,30 +163,32 @@ static starpu_filter f2 =
 	.filter_arg = 2
 };
 
-starpu_data_handle allocate_tmp_matrix(unsigned size, unsigned reclevel)
+static float *allocate_tmp_matrix_wrapper(size_t size)
 {
-	starpu_data_handle *data = malloc(sizeof(starpu_data_handle));
 	float *buffer;
 
-#ifdef USE_CUDA
-        if (pin) {
-                starpu_malloc_pinned_if_possible((void **)&buffer, size*size*sizeof(float));
-        } else
-#endif
-        {
-#ifdef HAVE_POSIX_MEMALIGN
-		posix_memalign((void **)&buffer, 4096, size*size*sizeof(float));
-#else
-		buffer = malloc(size*size*sizeof(float));
-#endif
-        }
+	buffer = (float *)&bigbuffer[used_mem];
+
+	/* XXX there could be some extra alignment constraints here */
+	used_mem += size;
 
-	assert(buffer);
+	if (used_mem > used_mem_predicted)
+		fprintf(stderr, "used %ld predict %ld\n", used_mem, used_mem_predicted);
 
-	used_mem += size*size*sizeof(float);
+	assert(used_mem <= used_mem_predicted);
 
-	memset(buffer, 0, size*size*sizeof(float));
+	memset(buffer, 0, size);
 
+	return buffer;
+
+}
+
+static starpu_data_handle allocate_tmp_matrix(unsigned size, unsigned reclevel)
+{
+	starpu_data_handle *data = malloc(sizeof(starpu_data_handle));
+	float *buffer;
+
+	buffer = allocate_tmp_matrix_wrapper(size*size*sizeof(float));
 
 	starpu_register_blas_data(data, 0, (uintptr_t)buffer, size, size, size, sizeof(float));
 
@@ -193,31 +207,31 @@ enum operation {
 };
 
 static starpu_codelet cl_add = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_add,
 	.core_func = add_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = add_cublas_codelet,
+	.cuda_func = add_cublas_codelet,
 #endif
 	.nbuffers = 3
 };
 
 static starpu_codelet cl_sub = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_sub,
 	.core_func = sub_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = sub_cublas_codelet,
+	.cuda_func = sub_cublas_codelet,
 #endif
 	.nbuffers = 3
 };
 
 static starpu_codelet cl_mult = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_mult,
 	.core_func = mult_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = mult_cublas_codelet,
+	.cuda_func = mult_cublas_codelet,
 #endif
 	.nbuffers = 3
 };
@@ -259,21 +273,21 @@ struct starpu_task *compute_add_sub_op(starpu_data_handle C, enum operation op,
 }
 
 static starpu_codelet cl_self_add = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_self_add,
 	.core_func = self_add_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = self_add_cublas_codelet,
+	.cuda_func = self_add_cublas_codelet,
 #endif
 	.nbuffers = 2
 };
 
 static starpu_codelet cl_self_sub = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = &strassen_model_self_sub,
 	.core_func = self_sub_core_codelet,
 #ifdef USE_CUDA
-	.cublas_func = self_sub_cublas_codelet,
+	.cuda_func = self_sub_cublas_codelet,
 #endif
 	.nbuffers = 2
 };
@@ -329,11 +343,11 @@ void cleanup_callback(void *_arg)
 }
 
 static starpu_codelet cleanup_codelet = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = NULL,
 	.core_func = null_codelet,
 #ifdef USE_CUDA
-	.cublas_func = null_codelet,
+	.cuda_func = null_codelet,
 #endif
 	.nbuffers = 0
 };
@@ -705,11 +719,11 @@ static void dummy_codelet_func(__attribute__((unused))starpu_data_interface_t *d
 }
 
 static starpu_codelet dummy_codelet = {
-	.where = CORE|CUBLAS,
+	.where = CORE|CUDA,
 	.model = NULL,
 	.core_func = dummy_codelet_func,
 	#ifdef USE_CUDA
-	.cublas_func = dummy_codelet_func,
+	.cuda_func = dummy_codelet_func,
 	#endif
 	.nbuffers = 0
 };
@@ -762,36 +776,33 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
+	assert(reclevel < MAXREC);
+
+	/* this is an upper bound ! */
+	used_mem_predicted = size*size*(predicted_mem[reclevel] + 1);
+
+	fprintf(stderr, "(Predicted) Memory consumption: %ld MB\n", used_mem_predicted/(1024*1024));
+
 	starpu_init(NULL);
 
+	starpu_helper_init_cublas();
+
 #ifdef USE_CUDA
         if (pin) {
-                starpu_malloc_pinned_if_possible((void **)&A, size*size*sizeof(float));
-                starpu_malloc_pinned_if_possible((void **)&B, size*size*sizeof(float));
-                starpu_malloc_pinned_if_possible((void **)&C, size*size*sizeof(float));
+                starpu_malloc_pinned_if_possible((void **)&bigbuffer, used_mem_predicted);
         } else
 #endif
         {
 #ifdef HAVE_POSIX_MEMALIGN
-                posix_memalign((void **)&A, 4096, size*size*sizeof(float));
-                posix_memalign((void **)&B, 4096, size*size*sizeof(float));
-                posix_memalign((void **)&C, 4096, size*size*sizeof(float));
+                posix_memalign((void **)&bigbuffer, 4096, used_mem_predicted);
 #else
-		A = malloc(size*size*sizeof(float));
-		B = malloc(size*size*sizeof(float));
-		C = malloc(size*size*sizeof(float));
+		bigbuffer = malloc(used_mem_predicted);
 #endif
-        }
-
-	assert(A);
-	assert(B);
-	assert(C);
-
-	used_mem += 3*size*size*sizeof(float);
+	}
 
-	memset(A, 0, size*size*sizeof(float));
-	memset(B, 0, size*size*sizeof(float));
-	memset(C, 0, size*size*sizeof(float));
+	A = allocate_tmp_matrix_wrapper(size*size*sizeof(float));
+	B = allocate_tmp_matrix_wrapper(size*size*sizeof(float));
+	C = allocate_tmp_matrix_wrapper(size*size*sizeof(float));
 
 	starpu_register_blas_data(&data_A, 0, (uintptr_t)A, size, size, size, sizeof(float));
 	starpu_register_blas_data(&data_B, 0, (uintptr_t)B, size, size, size, sizeof(float));
@@ -833,6 +844,8 @@ int main(int argc, char **argv)
 
 	gettimeofday(&end, NULL);
 
+	starpu_helper_shutdown_cublas();
+
 	starpu_shutdown();
 
 	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
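
For reference, here is a small stand-alone computation (not part of the commit) of the upper bound that drives the single pinned bigbuffer allocation above; the size and recursion level used are purely hypothetical, and the assumption that the table factors already account for sizeof(float) follows from predicted_mem[0] == 12 covering the three initial matrices.

#include <stdio.h>
#include <stdint.h>

/* same table as strassen2.c (the factors appear to include sizeof(float)) */
static unsigned predicted_mem[7] = {
	12, 29, 58, 110, 201, 361, 640
};

int main(void)
{
	unsigned size = 4096, reclevel = 3;	/* hypothetical parameters */
	uint64_t bound = (uint64_t)size * size * (predicted_mem[reclevel] + 1);

	printf("bigbuffer upper bound: %llu MB\n",
			(unsigned long long)(bound / (1024 * 1024)));
	return 0;
}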

+ 24 - 0
examples/strassen2/strassen2_kernels.c

@@ -65,6 +65,10 @@ static void mult_common_codelet(starpu_data_interface_t *buffers, int s, __attri
 
 	double flop = 2.0*n*n*n;
 
+#ifdef USE_CUDA
+	cublasStatus cublasres;
+#endif
+
 	switch (s) {
 		case 0:
 			cpus_flop += flop;
@@ -73,7 +77,11 @@ static void mult_common_codelet(starpu_data_interface_t *buffers, int s, __attri
 #ifdef USE_CUDA
 		case 1:
 			cublas_flop += flop;
+
 			cublasSgemm('n', 'n', n, n, n, 1.0f, right, ld12, left, ld21, 0.0f, center, ld22);
+			cublasres = cublasGetError();
+			if (STARPU_UNLIKELY(cublasres))
+				CUBLAS_REPORT_ERROR(cublasres);
 			break;
 #endif
 		default:
@@ -113,6 +121,9 @@ static void add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __at
 	// TODO check dim ...
 
 	unsigned line;
+#ifdef USE_CUDA
+	cublasStatus cublasres;
+#endif
 
 	switch (s) {
 		case 0:
@@ -132,8 +143,14 @@ static void add_sub_common_codelet(starpu_data_interface_t *buffers, int s, __at
 			{
 				/* copy line A into C */
 				cublasSaxpy(n, 1.0f, &A[line*ldA], 1, &C[line*ldC], 1);
+				cublasres = cublasGetError();
+				if (STARPU_UNLIKELY(cublasres))
+					CUBLAS_REPORT_ERROR(cublasres);
 				/* add line B to C = A */
 				cublasSaxpy(n, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
+				cublasres = cublasGetError();
+				if (STARPU_UNLIKELY(cublasres))
+					CUBLAS_REPORT_ERROR(cublasres);
 			}
 
 			break;
@@ -185,6 +202,10 @@ static void self_add_sub_common_codelet(starpu_data_interface_t *buffers, int s,
 	
 	unsigned line;
 
+#ifdef USE_CUDA
+	cublasStatus cublasres;
+#endif
+
 	switch (s) {
 		case 0:
 			cpus_flop += flop;
@@ -201,6 +222,9 @@ static void self_add_sub_common_codelet(starpu_data_interface_t *buffers, int s,
 			{
 				/* add line A to C */
 				cublasSaxpy(n, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
+				cublasres = cublasGetError();
+				if (STARPU_UNLIKELY(cublasres))
+					CUBLAS_REPORT_ERROR(cublasres);
 			}
 			break;
 #endif

+ 4 - 3
examples/tag_example/tag_example.c

@@ -150,7 +150,8 @@ void callback_core(void *argcb __attribute__ ((unused)))
 	}
 }
 
-void core_codelet(void *_args __attribute__ ((unused)))
+void core_codelet(starpu_data_interface_t *buffers __attribute__((unused)),
+			void *_args __attribute__ ((unused)))
 {
 //	printf("execute task\n");
 }
@@ -198,9 +199,9 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 
 	fprintf(stderr, "ITER: %d\n", nk);
 
-	cl.where = ANY;
+	cl.where = CORE|CUDA|GORDON;
 	cl.core_func = core_codelet;
-	cl.cublas_func = core_codelet;
+	cl.cuda_func = core_codelet;
 #ifdef USE_GORDON
 	cl.gordon_func = gordon_null_kernel;
 #endif

+ 4 - 3
examples/tag_example/tag_example2.c

@@ -100,7 +100,8 @@ static void create_task_grid(unsigned iter)
 
 }
 
-void core_codelet(void *_args __attribute__ ((unused)))
+void core_codelet(starpu_data_interface_t *buffers __attribute__ ((unused)),
+			void *_args __attribute__ ((unused)))
 {
 }
 
@@ -118,11 +119,11 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	parse_args(argc, argv);
 
 	cl.core_func = core_codelet;
-	cl.cublas_func = core_codelet;
+	cl.cuda_func = core_codelet;
 #ifdef USE_GORDON
 	cl.gordon_func = gordon_null_kernel;
 #endif
-	cl.where = ANY;
+	cl.where = CORE|CUDA|GORDON;
 	cl.nbuffers = 0;
 
 	fprintf(stderr, "ITER : %d\n", nk);

+ 2 - 2
examples/tag_example/tag_example3.c

@@ -118,11 +118,11 @@ int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
 	parse_args(argc, argv);
 
 	cl.core_func = core_codelet;
-	cl.cublas_func = core_codelet;
+	cl.cuda_func = core_codelet;
 #ifdef USE_GORDON
 	cl.gordon_func = gordon_null_kernel;
 #endif
-	cl.where = ANY;
+	cl.where = CORE|CUDA|GORDON;
 	cl.nbuffers = 0;
 
 	fprintf(stderr, "ITER : %d\n", nk);

+ 156 - 0
examples/tag_example/tag_restartable.c

@@ -0,0 +1,156 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <semaphore.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include <starpu.h>
+
+#ifdef USE_GORDON
+#include <gordon/null.h>
+#endif
+
+#define Nrolls	4
+#define SLEEP 1
+
+#define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)((iter)%Nrolls))<<32 | (i)) )
+
+starpu_codelet cl;
+
+#define Ni	64
+#define Nk	256
+
+static unsigned ni = Ni, nk = Nk;
+static unsigned callback_cnt;
+struct starpu_task **tasks[Nrolls];
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-iter") == 0) {
+		        char *argptr;
+			nk = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-i") == 0) {
+		        char *argptr;
+			ni = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-h") == 0) {
+			printf("usage : %s [-iter iter] [-i i]\n", argv[0]);
+		}
+	}
+}
+
+void callback_core(void *argcb);
+
+static void create_task_grid(unsigned iter)
+{
+	unsigned i;
+
+	fprintf(stderr, "init iter %d ni %d...\n", iter, ni);
+
+	callback_cnt = (ni);
+
+	for (i = 0; i < ni; i++)
+	{
+		/* create a new task */
+		struct starpu_task *task = tasks[iter][i] = starpu_task_create();
+
+		task->cl = &cl;
+		task->cl_arg = (void*)(uintptr_t) (i | (iter << 16));
+
+		task->use_tag = 1;
+		task->tag_id = TAG(i, iter);
+
+		task->detach = 1;
+		task->destroy = 0;
+
+		if (i != 0)
+			starpu_tag_declare_deps(TAG(i,iter), 1, TAG(i-1,iter));
+	}
+
+}
+
+static void start_task_grid(unsigned iter)
+{
+	unsigned i;
+
+	//fprintf(stderr, "start grid %d ni %d...\n", iter, ni);
+
+	for (i = 0; i < ni; i++)
+		starpu_submit_task(tasks[iter][i]);
+}
+
+void core_codelet(starpu_data_interface_t *descr, void *_args __attribute__((unused)))
+{
+	//int i = (uintptr_t) _args;
+	//printf("doing %x\n", i);
+	//usleep(SLEEP);
+	//printf("done %x\n", i);
+}
+
+int main(int argc __attribute__((unused)) , char **argv __attribute__((unused)))
+{
+	unsigned i;
+
+	starpu_init(NULL);
+
+#ifdef USE_GORDON
+	/* load an empty kernel and get its identifier */
+	unsigned gordon_null_kernel = load_gordon_null_kernel();
+#endif
+
+	parse_args(argc, argv);
+
+	cl.core_func = core_codelet;
+	cl.cuda_func = core_codelet;
+#ifdef USE_GORDON
+	cl.gordon_func = gordon_null_kernel;
+#endif
+	cl.where = CORE|CUDA|GORDON;
+	cl.nbuffers = 0;
+
+	fprintf(stderr, "ITER : %d\n", nk);
+
+	for (i = 0; i < Nrolls; i++) {
+		tasks[i] = malloc(ni * sizeof(*tasks[i]));
+
+		create_task_grid(i);
+	}
+
+	for (i = 0; i < nk; i++)
+	{
+		start_task_grid(i % Nrolls);
+
+		if (i+1 >= Nrolls)
+			/* Wait before re-using same tasks & tags */
+			starpu_tag_wait(TAG(ni-1, i + 1));
+	}
+
+	starpu_shutdown();
+
+	fprintf(stderr, "TEST DONE ...\n");
+
+	return 0;
+}
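
A small illustration (not part of the commit) of the tag layout used by TAG(): the roll index, i.e. iter modulo Nrolls, lands in the upper 32 bits, so iterations that map to the same roll share their tags, which is exactly why the loop above waits on the previous user of a roll before restarting it. The values printed below are just examples.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t starpu_tag_t;

#define Nrolls	4
#define TAG(i, iter)	((starpu_tag_t)  (((uint64_t)((iter)%Nrolls))<<32 | (i)) )

int main(void)
{
	/* iterations 5 and 9 both map to roll 1, so they produce the same tag */
	printf("TAG(5, 5) = %#llx\n", (unsigned long long) TAG(5, 5));
	printf("TAG(5, 9) = %#llx\n", (unsigned long long) TAG(5, 9));
	return 0;
}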

+ 8 - 0
include/starpu-data-filters.h

@@ -19,6 +19,10 @@
 
 #include <starpu_config.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct starpu_data_state_t;
 
 typedef struct starpu_filter_t {
@@ -50,4 +54,8 @@ unsigned starpu_block_filter_func_vector(starpu_filter *f, struct starpu_data_st
 unsigned starpu_list_filter_func_vector(starpu_filter *f, struct starpu_data_state_t *root_data);
 unsigned starpu_divide_in_2_filter_func_vector(starpu_filter *f, struct starpu_data_state_t *root_data);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif

+ 11 - 0
include/starpu-data-interfaces.h

@@ -17,6 +17,12 @@
 #ifndef __STARPU_DATA_INTERFACES_H__
 #define __STARPU_DATA_INTERFACES_H__
 
+#include <starpu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct starpu_data_state_t;
 typedef struct starpu_data_state_t * starpu_data_handle;
 
@@ -93,6 +99,7 @@ uint32_t starpu_get_csr_firstentry(starpu_data_handle handle);
 uintptr_t starpu_get_csr_local_nzval(starpu_data_handle handle);
 uint32_t *starpu_get_csr_local_colind(starpu_data_handle handle);
 uint32_t *starpu_get_csr_local_rowptr(starpu_data_handle handle);
+size_t starpu_get_csr_elemsize(struct starpu_data_state_t *state);
 
 /* CSC interface for sparse matrices (compressed sparse column representation) */
 typedef struct starpu_csc_interface_s {
@@ -141,6 +148,7 @@ uint32_t *starpu_get_bcsr_local_colind(starpu_data_handle);
 uint32_t *starpu_get_bcsr_local_rowptr(starpu_data_handle);
 uint32_t starpu_get_bcsr_r(starpu_data_handle);
 uint32_t starpu_get_bcsr_c(starpu_data_handle);
+size_t starpu_get_bcsr_elemsize(struct starpu_data_state_t *state);
 
 typedef union {
 	starpu_blas_interface_t blas;	/* dense BLAS representation */
@@ -152,5 +160,8 @@ typedef union {
 	uint8_t pad[64];
 } starpu_data_interface_t;
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif // __STARPU_DATA_INTERFACES_H__

+ 22 - 1
include/starpu-data.h

@@ -21,7 +21,11 @@
 #include <starpu-data-interfaces.h>
 #include <starpu-data-filters.h>
 
-#define NMAXBUFS        8
+#define STARPU_NMAXBUFS        8
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 struct starpu_data_state_t;
 
@@ -45,5 +49,22 @@ void starpu_sync_data_with_mem(struct starpu_data_state_t *state);
 void starpu_notify_data_modification(struct starpu_data_state_t *state, uint32_t modifying_node);
 
 void starpu_malloc_pinned_if_possible(void **A, size_t dim);
+void starpu_free_pinned_if_possible(void *A);
+
+int starpu_request_data_allocation(struct starpu_data_state_t *state, uint32_t node);
+
+void starpu_prefetch_data_on_node(struct starpu_data_state_t *state, unsigned node, unsigned async);
+
+unsigned starpu_get_worker_memory_node(unsigned workerid);
+
+/* It is possible to associate a mask with a piece of data (and its children)
+ * so that when it is modified, it is automatically transferred to those
+ * memory nodes. For instance a (1<<0) write-back mask means that the CUDA
+ * workers will commit their changes to main memory (node 0). */
+void starpu_data_set_wb_mask(struct starpu_data_state_t *state, uint32_t wb_mask);
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif // __STARPU_DATA_H__

+ 12 - 2
include/starpu-perfmodel.h

@@ -21,6 +21,10 @@
 #include <pthread.h>
 #include <starpu_config.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct starpu_htbl32_node_s;
 struct starpu_history_list_t;
 struct starpu_buffer_descr_t;
@@ -33,12 +37,15 @@ struct starpu_buffer_descr_t;
 
 /* on most system we will consider one or two architectures as all accelerators
    are likely to be identical */
-#define NARCH_VARIATIONS	3
+#define NARCH_VARIATIONS	6
 
 enum starpu_perf_archtype {
 	STARPU_CORE_DEFAULT = 0,
 	STARPU_CUDA_DEFAULT = 1,
-	STARPU_GORDON_DEFAULT = 2
+	STARPU_CUDA_2 = 2,
+	STARPU_CUDA_3 = 3,
+	STARPU_CUDA_4 = 4,
+	STARPU_GORDON_DEFAULT = 5
 };
 
 
@@ -102,5 +109,8 @@ int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *mod
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
 		enum starpu_perf_archtype arch, char **path, size_t maxlen);
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif // __STARPU_PERFMODEL_H__

+ 90 - 58
include/starpu-task.h

@@ -19,6 +19,7 @@
 
 #include <errno.h>
 #include <starpu_config.h>
+#include <starpu.h>
 
 /* this is a randomly choosen value ... */
 #ifndef MAXCUDADEVS
@@ -31,9 +32,7 @@
 
 #include <starpu-data.h>
 
-#define ANY	(~0)
 #define CORE	((1ULL)<<1)
-#define CUBLAS	((1ULL)<<2)
 #define CUDA	((1ULL)<<3)
 #define SPU	((1ULL)<<4)
 #define GORDON	((1ULL)<<5)
@@ -42,6 +41,10 @@
 #define MAX_PRIO        5
 #define DEFAULT_PRIO	0
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef uint64_t starpu_tag_t;
 
 /*
@@ -53,24 +56,27 @@ typedef struct starpu_codelet_t {
 	uint32_t where;
 
 	/* the different implementations of the codelet */
-	void *cuda_func;
-	void *cublas_func;
-	void *core_func;
-	void *spu_func;
+	void (*cuda_func)(starpu_data_interface_t *, void *);
+	void (*core_func)(starpu_data_interface_t *, void *);
 	uint8_t gordon_func;
 
 	/* how many buffers do the codelet takes as argument ? */
 	unsigned nbuffers;
 
 	struct starpu_perfmodel_t *model;
+
+	/* statistics collected at runtime: this is filled by StarPU and should
+	 * not be accessed directly (use the starpu_display_codelet_stats
+	 * function instead). */
+	unsigned long per_worker_stats[STARPU_NMAXWORKERS];
 } starpu_codelet;
 
 struct starpu_task {
 	struct starpu_codelet_t *cl;
 
 	/* arguments managed by the DSM */
-	struct starpu_buffer_descr_t buffers[NMAXBUFS];
-	starpu_data_interface_t interface[NMAXBUFS];
+	struct starpu_buffer_descr_t buffers[STARPU_NMAXBUFS];
+	starpu_data_interface_t interface[STARPU_NMAXBUFS];
 
 	/* arguments not managed by the DSM are given as a buffer */
 	void *cl_arg;
@@ -89,76 +95,102 @@ struct starpu_task {
 	int priority; /* MAX_PRIO = most important 
         		: MIN_PRIO = least important */
 
-	/* should the task be automatically liberated once executed ? */
-	int cleanup;
-
-	/* this is private to StarPU, do not modify */
+	/* in case the task has to be executed on a specific worker */
+	unsigned execute_on_a_specific_worker;
+	unsigned workerid;
+
+	/* If this flag is set, it is not possible to synchronize with the task
+	 * by means of starpu_wait_task later on. Internal data structures are
+	 * only guaranteed to be freed once starpu_wait_task is called if that
+	 * flag is not set. */
+	int detach;
+
+	/* If that flag is set, the task structure will automatically be
+	 * freed, either after the execution of the callback if the task is
+	 * detached, or during starpu_wait_task otherwise. If this flag is not
+	 * set, dynamically allocated data structures will not be freed until
+	 * starpu_task_destroy is called explicitly. Setting this flag for a
+	 * statically allocated task structure will result in undefined
+	 * behaviour. */
+	int destroy;
+
+	/* this is private to StarPU, do not modify. If the task is allocated
+	 * by hand (without starpu_task_create), this field should be set to
+	 * NULL. */
 	void *starpu_private;
 };
 
-#ifdef USE_CUDA
-/* CUDA specific codelets */
-typedef struct starpu_cuda_module_s {
-	CUmodule module;
-	char *module_path;
-	unsigned is_loaded[MAXCUDADEVS];
-} starpu_cuda_module_t;
-
-typedef struct starpu_cuda_function_s {
-	struct starpu_cuda_module_s *module;
-	CUfunction function;
-	char *symbol;
-	unsigned is_loaded[MAXCUDADEVS];
-} starpu_cuda_function_t;
-
-typedef struct starpu_cuda_codelet_s {
-	/* which function to execute on the card ? */
-	struct starpu_cuda_function_s *func;
-
-	/* grid and block shapes */
-	unsigned gridx;
-	unsigned gridy;
-	unsigned blockx;
-	unsigned blocky;
-
-	unsigned shmemsize;
-
-	void *stack; /* arguments */
-	size_t stack_size;
-} starpu_cuda_codelet_t;
-
-void starpu_init_cuda_module(struct starpu_cuda_module_s *module, char *path);
-void starpu_load_cuda_module(int devid, struct starpu_cuda_module_s *module);
-void starpu_init_cuda_function(struct starpu_cuda_function_s *func,
-                        struct starpu_cuda_module_s *module,
-                        char *symbol);
-void starpu_load_cuda_function(int devid, struct starpu_cuda_function_s *function);
-#endif // USE_CUDA
-
-/* handle task dependencies: it is possible to associate a task with a unique
- * "tag" and to express dependencies among tasks by the means of those tags */
-void starpu_tag_remove(starpu_tag_t id);
+/* It is possible to initialize statically allocated tasks with this value.
+ * This is equivalent to initializing a starpu_task structure with the
+ * starpu_task_init function. */
+#define STARPU_TASK_INITIALIZER 			\
+{							\
+	.cl = NULL,					\
+	.cl_arg = NULL,					\
+	.cl_arg_size = 0,				\
+	.callback_func = NULL,				\
+	.callback_arg = NULL,				\
+	.priority = DEFAULT_PRIO,			\
+	.use_tag = 0,					\
+	.synchronous = 0,				\
+	.execute_on_a_specific_worker = 0,		\
+	.detach = 1,					\
+	.destroy = 0,					\
+	.starpu_private = NULL				\
+};
+
+/*
+ * handle task dependencies: it is possible to associate a task with a unique
+ * "tag" and to express dependencies between tasks by the means of those tags
+ *
+ * To do so, fill the tag_id field with a tag number (can be arbitrary) and set
+ * use_tag to 1.
+ *
+ * If starpu_tag_declare_deps is called with that tag number, the task will not
+ * be started until the tasks bearing the declared dependency tags are
+ * complete.
+ */
 
 /*
  * WARNING ! use with caution ...
  *  In case starpu_tag_declare_deps is passed constant arguments, the caller
- *  must make sure that the constants have the same size as starpu_tag_t.
- *  Otherwise, nothing prevents the C compiler to consider the tag 0x20000003
- *  instead of 0x2 and 0x3 when calling:
+ *  must make sure that the constants are casted to starpu_tag_t. Otherwise,
+ *  due to integer sizes and argument passing on the stack, the C compiler
+ *  might consider the tag *  0x200000003 instead of 0x2 and 0x3 when calling:
  *      "starpu_tag_declare_deps(0x1, 2, 0x2, 0x3)"
  *  Using starpu_tag_declare_deps_array is a way to avoid this problem.
  */
+/* make id depend on the list of ids */
 void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);
 void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);
 
 void starpu_tag_wait(starpu_tag_t id);
 void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);
 
-/* it is possible that the application use tags explicitely */
+/* The application can feed a tag explicitely */
 void starpu_tag_notify_from_apps(starpu_tag_t id);
 
+/* To release resources, tags should be freed after use */
+void starpu_tag_remove(starpu_tag_t id);
+
+void starpu_task_init(struct starpu_task *task);
 struct starpu_task *starpu_task_create(void);
+void starpu_task_destroy(struct starpu_task *task);
 int starpu_submit_task(struct starpu_task *task);
 
+/* This function blocks until the task has been executed. It is not possible
+ * to synchronize with a task more than once. It is not possible to wait for
+ * synchronous or detached tasks.
+ * Upon successful completion, this function returns 0. Otherwise, -EINVAL
+ * indicates that the waited task was either synchronous or detached. */
+int starpu_wait_task(struct starpu_task *task);
+
+void starpu_wait_all_tasks(void);
+
+void starpu_display_codelet_stats(struct starpu_codelet_t *cl);
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif // __STARPU_TASK_H__
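
A minimal sketch (not part of the commit) of the reworked task life cycle described in the comments above, using a hypothetical no-op codelet: a non-detached task can be waited for with starpu_wait_task(), and destroy = 1 lets StarPU free the structure once the wait returns.

#include <starpu.h>

static void noop(starpu_data_interface_t *buffers __attribute__((unused)),
		void *arg __attribute__((unused)))
{
}

static starpu_codelet cl = {
	.where = CORE,
	.core_func = noop,
	.nbuffers = 0
};

int main(void)
{
	starpu_init(NULL);

	struct starpu_task *task = starpu_task_create();
	task->cl = &cl;
	task->detach = 0;	/* we want to synchronize with this task */
	task->destroy = 1;	/* freed by StarPU during starpu_wait_task */

	starpu_submit_task(task);

	/* returns 0 on success, -EINVAL for synchronous or detached tasks */
	if (starpu_wait_task(task) != 0)
		return 1;

	starpu_display_codelet_stats(&cl);

	starpu_shutdown();
	return 0;
}

A statically allocated task would instead be set up with starpu_task_init() or STARPU_TASK_INITIALIZER, and must keep destroy at 0.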

+ 154 - 2
include/starpu-util.h

@@ -23,6 +23,16 @@
 #include <assert.h>
 #include <starpu_config.h>
 
+#ifdef USE_CUDA
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <cublas.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define STARPU_MIN(a,b)	((a)<(b)?(a):(b))
 #define STARPU_MAX(a,b)	((a)<(b)?(b):(a))
 
@@ -31,18 +41,150 @@
 #define STARPU_UNLIKELY(expr)          (__builtin_expect(!!(expr),0))
 #define STARPU_LIKELY(expr)            (__builtin_expect(!!(expr),1))
 
-#ifdef HAVE_SYNC_BUILTINS
+#ifdef HAVE_SYNC_FETCH_AND_ADD
 #define STARPU_ATOMIC_ADD(ptr, value)  (__sync_fetch_and_add ((ptr), (value)) + (value))
 #define STARPU_ATOMIC_OR(ptr, value)  (__sync_fetch_and_or ((ptr), (value)))
 #else
 #error __sync_fetch_and_add is not available
 #endif
 
+#ifdef USE_CUDA
+
+#define CUBLAS_REPORT_ERROR(status) 					\
+	do {								\
+		char *errormsg;						\
+		switch (status) {					\
+			case CUBLAS_STATUS_SUCCESS:			\
+				errormsg = "success";			\
+				break;					\
+			case CUBLAS_STATUS_NOT_INITIALIZED:		\
+				errormsg = "not initialized";		\
+				break;					\
+			case CUBLAS_STATUS_ALLOC_FAILED:		\
+				errormsg = "alloc failed";		\
+				break;					\
+			case CUBLAS_STATUS_INVALID_VALUE:		\
+				errormsg = "invalid value";		\
+				break;					\
+			case CUBLAS_STATUS_ARCH_MISMATCH:		\
+				errormsg = "arch mismatch";		\
+				break;					\
+			case CUBLAS_STATUS_EXECUTION_FAILED:		\
+				errormsg = "execution failed";		\
+				break;					\
+			case CUBLAS_STATUS_INTERNAL_ERROR:		\
+				errormsg = "internal error";		\
+				break;					\
+			default:					\
+				errormsg = "unknown error";		\
+				break;					\
+		}							\
+		printf("oops  in %s ... %s \n", __func__, errormsg);	\
+		assert(0);						\
+	} while (0)  
+
+
+
+#define CUDA_REPORT_ERROR(status) 					\
+	do {								\
+		char *errormsg;						\
+		switch (status) {					\
+			case CUDA_SUCCESS:				\
+				errormsg = "success";			\
+				break;					\
+			case CUDA_ERROR_INVALID_VALUE:			\
+				errormsg = "invalid value";		\
+				break;					\
+			case CUDA_ERROR_OUT_OF_MEMORY:			\
+				errormsg = "out of memory";		\
+				break;					\
+			case CUDA_ERROR_NOT_INITIALIZED:		\
+				errormsg = "not initialized";		\
+				break;					\
+			case CUDA_ERROR_DEINITIALIZED:			\
+				errormsg = "deinitialized";		\
+				break;					\
+			case CUDA_ERROR_NO_DEVICE:			\
+				errormsg = "no device";			\
+				break;					\
+			case CUDA_ERROR_INVALID_DEVICE:			\
+				errormsg = "invalid device";		\
+				break;					\
+			case CUDA_ERROR_INVALID_IMAGE:			\
+				errormsg = "invalid image";		\
+				break;					\
+			case CUDA_ERROR_INVALID_CONTEXT:		\
+				errormsg = "invalid context";		\
+				break;					\
+			case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:	\
+				errormsg = "context already current";	\
+				break;					\
+			case CUDA_ERROR_MAP_FAILED:			\
+				errormsg = "map failed";		\
+				break;					\
+			case CUDA_ERROR_UNMAP_FAILED:			\
+				errormsg = "unmap failed";		\
+				break;					\
+			case CUDA_ERROR_ARRAY_IS_MAPPED:		\
+				errormsg = "array is mapped";		\
+				break;					\
+			case CUDA_ERROR_ALREADY_MAPPED:			\
+				errormsg = "already mapped";		\
+				break;					\
+			case CUDA_ERROR_NO_BINARY_FOR_GPU:		\
+				errormsg = "no binary for gpu";		\
+				break;					\
+			case CUDA_ERROR_ALREADY_ACQUIRED:		\
+				errormsg = "already acquired";		\
+				break;					\
+			case CUDA_ERROR_NOT_MAPPED:			\
+				errormsg = "not mapped";		\
+				break;					\
+			case CUDA_ERROR_INVALID_SOURCE:			\
+				errormsg = "invalid source";		\
+				break;					\
+			case CUDA_ERROR_FILE_NOT_FOUND:			\
+				errormsg = "file not found";		\
+				break;					\
+			case CUDA_ERROR_INVALID_HANDLE:			\
+				errormsg = "invalid handle";		\
+				break;					\
+			case CUDA_ERROR_NOT_FOUND:			\
+				errormsg = "not found";			\
+				break;					\
+			case CUDA_ERROR_NOT_READY:			\
+				errormsg = "not ready";			\
+				break;					\
+			case CUDA_ERROR_LAUNCH_FAILED:			\
+				errormsg = "launch failed";		\
+				break;					\
+			case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:	\
+				errormsg = "launch out of resources";	\
+				break;					\
+			case CUDA_ERROR_LAUNCH_TIMEOUT:			\
+				errormsg = "launch timeout";		\
+				break;					\
+			case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:	\
+				errormsg = "launch incompatible texturing";\
+				break;					\
+			case CUDA_ERROR_UNKNOWN:			\
+			default:					\
+				errormsg = "unknown error";		\
+				break;					\
+		}							\
+		printf("oops  in %s ... %s \n", __func__, errormsg);	\
+		assert(0);						\
+	} while (0)  
+
+#endif // USE_CUDA
+
+
+
 #define STARPU_SUCCESS	0
 #define STARPU_TRYAGAIN	1
 #define STARPU_FATAL	2
 
-static int __attribute__ ((unused)) starpu_get_env_number(const char *str)
+static inline int starpu_get_env_number(const char *str)
 {
 	char *strval;
 
@@ -65,4 +207,14 @@ static int __attribute__ ((unused)) starpu_get_env_number(const char *str)
 	}
 }
 
+void starpu_trace_user_event(unsigned code);
+
+/* Some helper functions for application using CUBLAS kernels */
+void starpu_helper_init_cublas(void);
+void starpu_helper_shutdown_cublas(void);
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif // __STARPU_UTIL_H__
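
A minimal sketch (not part of the commit) of the new CUBLAS helpers; the synchronization point at the end is hypothetical and only shows where the error-reporting macros are typically placed (the strassen2 kernels above use the same cublasGetError()/CUBLAS_REPORT_ERROR pattern inside their cuda_func).

#include <starpu.h>

int main(void)
{
	starpu_init(NULL);

	/* set up CUBLAS on every CUDA worker before submitting CUBLAS codelets */
	starpu_helper_init_cublas();

	/* ... submit tasks whose cuda_func calls CUBLAS kernels ... */

#ifdef USE_CUDA
	cudaError_t cures = cudaThreadSynchronize();
	if (STARPU_UNLIKELY(cures != cudaSuccess))
		CUDA_REPORT_ERROR(cures);
#endif

	starpu_helper_shutdown_cublas();
	starpu_shutdown();
	return 0;
}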

+ 0 - 0
include/starpu.h


Some files were not shown because too many files changed in this diff