Nathalie Furmento 12 роки тому
батько
коміт
22b0946858

+ 0 - 342
build-aux/compile

@@ -1,342 +0,0 @@
-#! /bin/sh
-# Wrapper for compilers which do not understand '-c -o'.
-
-scriptversion=2012-03-05.13; # UTC
-
-# Copyright (C) 1999-2012 Free Software Foundation, Inc.
-# Written by Tom Tromey <tromey@cygnus.com>.
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# This file is maintained in Automake, please report
-# bugs to <bug-automake@gnu.org> or send patches to
-# <automake-patches@gnu.org>.
-
-nl='
-'
-
-# We need space, tab and new line, in precisely that order.  Quoting is
-# there to prevent tools from complaining about whitespace usage.
-IFS=" ""	$nl"
-
-file_conv=
-
-# func_file_conv build_file lazy
-# Convert a $build file to $host form and store it in $file
-# Currently only supports Windows hosts. If the determined conversion
-# type is listed in (the comma separated) LAZY, no conversion will
-# take place.
-func_file_conv ()
-{
-  file=$1
-  case $file in
-    / | /[!/]*) # absolute file, and not a UNC file
-      if test -z "$file_conv"; then
-	# lazily determine how to convert abs files
-	case `uname -s` in
-	  MINGW*)
-	    file_conv=mingw
-	    ;;
-	  CYGWIN*)
-	    file_conv=cygwin
-	    ;;
-	  *)
-	    file_conv=wine
-	    ;;
-	esac
-      fi
-      case $file_conv/,$2, in
-	*,$file_conv,*)
-	  ;;
-	mingw/*)
-	  file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
-	  ;;
-	cygwin/*)
-	  file=`cygpath -m "$file" || echo "$file"`
-	  ;;
-	wine/*)
-	  file=`winepath -w "$file" || echo "$file"`
-	  ;;
-      esac
-      ;;
-  esac
-}
-
-# func_cl_dashL linkdir
-# Make cl look for libraries in LINKDIR
-func_cl_dashL ()
-{
-  func_file_conv "$1"
-  if test -z "$lib_path"; then
-    lib_path=$file
-  else
-    lib_path="$lib_path;$file"
-  fi
-  linker_opts="$linker_opts -LIBPATH:$file"
-}
-
-# func_cl_dashl library
-# Do a library search-path lookup for cl
-func_cl_dashl ()
-{
-  lib=$1
-  found=no
-  save_IFS=$IFS
-  IFS=';'
-  for dir in $lib_path $LIB
-  do
-    IFS=$save_IFS
-    if $shared && test -f "$dir/$lib.dll.lib"; then
-      found=yes
-      lib=$dir/$lib.dll.lib
-      break
-    fi
-    if test -f "$dir/$lib.lib"; then
-      found=yes
-      lib=$dir/$lib.lib
-      break
-    fi
-  done
-  IFS=$save_IFS
-
-  if test "$found" != yes; then
-    lib=$lib.lib
-  fi
-}
-
-# func_cl_wrapper cl arg...
-# Adjust compile command to suit cl
-func_cl_wrapper ()
-{
-  # Assume a capable shell
-  lib_path=
-  shared=:
-  linker_opts=
-  for arg
-  do
-    if test -n "$eat"; then
-      eat=
-    else
-      case $1 in
-	-o)
-	  # configure might choose to run compile as 'compile cc -o foo foo.c'.
-	  eat=1
-	  case $2 in
-	    *.o | *.[oO][bB][jJ])
-	      func_file_conv "$2"
-	      set x "$@" -Fo"$file"
-	      shift
-	      ;;
-	    *)
-	      func_file_conv "$2"
-	      set x "$@" -Fe"$file"
-	      shift
-	      ;;
-	  esac
-	  ;;
-	-I)
-	  eat=1
-	  func_file_conv "$2" mingw
-	  set x "$@" -I"$file"
-	  shift
-	  ;;
-	-I*)
-	  func_file_conv "${1#-I}" mingw
-	  set x "$@" -I"$file"
-	  shift
-	  ;;
-	-l)
-	  eat=1
-	  func_cl_dashl "$2"
-	  set x "$@" "$lib"
-	  shift
-	  ;;
-	-l*)
-	  func_cl_dashl "${1#-l}"
-	  set x "$@" "$lib"
-	  shift
-	  ;;
-	-L)
-	  eat=1
-	  func_cl_dashL "$2"
-	  ;;
-	-L*)
-	  func_cl_dashL "${1#-L}"
-	  ;;
-	-static)
-	  shared=false
-	  ;;
-	-Wl,*)
-	  arg=${1#-Wl,}
-	  save_ifs="$IFS"; IFS=','
-	  for flag in $arg; do
-	    IFS="$save_ifs"
-	    linker_opts="$linker_opts $flag"
-	  done
-	  IFS="$save_ifs"
-	  ;;
-	-Xlinker)
-	  eat=1
-	  linker_opts="$linker_opts $2"
-	  ;;
-	-*)
-	  set x "$@" "$1"
-	  shift
-	  ;;
-	*.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
-	  func_file_conv "$1"
-	  set x "$@" -Tp"$file"
-	  shift
-	  ;;
-	*.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
-	  func_file_conv "$1" mingw
-	  set x "$@" "$file"
-	  shift
-	  ;;
-	*)
-	  set x "$@" "$1"
-	  shift
-	  ;;
-      esac
-    fi
-    shift
-  done
-  if test -n "$linker_opts"; then
-    linker_opts="-link$linker_opts"
-  fi
-  exec "$@" $linker_opts
-  exit 1
-}
-
-eat=
-
-case $1 in
-  '')
-     echo "$0: No command.  Try '$0 --help' for more information." 1>&2
-     exit 1;
-     ;;
-  -h | --h*)
-    cat <<\EOF
-Usage: compile [--help] [--version] PROGRAM [ARGS]
-
-Wrapper for compilers which do not understand '-c -o'.
-Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
-arguments, and rename the output as expected.
-
-If you are trying to build a whole package this is not the
-right script to run: please start by reading the file 'INSTALL'.
-
-Report bugs to <bug-automake@gnu.org>.
-EOF
-    exit $?
-    ;;
-  -v | --v*)
-    echo "compile $scriptversion"
-    exit $?
-    ;;
-  cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )
-    func_cl_wrapper "$@"      # Doesn't return...
-    ;;
-esac
-
-ofile=
-cfile=
-
-for arg
-do
-  if test -n "$eat"; then
-    eat=
-  else
-    case $1 in
-      -o)
-	# configure might choose to run compile as 'compile cc -o foo foo.c'.
-	# So we strip '-o arg' only if arg is an object.
-	eat=1
-	case $2 in
-	  *.o | *.obj)
-	    ofile=$2
-	    ;;
-	  *)
-	    set x "$@" -o "$2"
-	    shift
-	    ;;
-	esac
-	;;
-      *.c)
-	cfile=$1
-	set x "$@" "$1"
-	shift
-	;;
-      *)
-	set x "$@" "$1"
-	shift
-	;;
-    esac
-  fi
-  shift
-done
-
-if test -z "$ofile" || test -z "$cfile"; then
-  # If no '-o' option was seen then we might have been invoked from a
-  # pattern rule where we don't need one.  That is ok -- this is a
-  # normal compilation that the losing compiler can handle.  If no
-  # '.c' file was seen then we are probably linking.  That is also
-  # ok.
-  exec "$@"
-fi
-
-# Name of file we expect compiler to create.
-cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
-
-# Create the lock directory.
-# Note: use '[/\\:.-]' here to ensure that we don't use the same name
-# that we are using for the .o file.  Also, base the name on the expected
-# object file name, since that is what matters with a parallel build.
-lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
-while true; do
-  if mkdir "$lockdir" >/dev/null 2>&1; then
-    break
-  fi
-  sleep 1
-done
-# FIXME: race condition here if user kills between mkdir and trap.
-trap "rmdir '$lockdir'; exit 1" 1 2 15
-
-# Run the compile.
-"$@"
-ret=$?
-
-if test -f "$cofile"; then
-  test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
-elif test -f "${cofile}bj"; then
-  test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
-fi
-
-rmdir "$lockdir"
-exit $ret
-
-# Local Variables:
-# mode: shell-script
-# sh-indentation: 2
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-time-zone: "UTC"
-# time-stamp-end: "; # UTC"
-# End:

+ 3 - 0
configure.ac

@@ -171,6 +171,9 @@ fi
 # Some systems do not define strerror_r
 AC_CHECK_FUNC([strerror_r], [AC_DEFINE([STARPU_HAVE_STRERROR_R], [1], [Define to 1 if the function strerro_r is available.])])
 
+# Some systems may not define setenv
+AC_CHECK_FUNC([setenv], [AC_DEFINE([STARPU_HAVE_SETENV], [1], [Define to 1 if the function setenv is available.])])
+
 # Some systems do not define unsetenv
 AC_CHECK_FUNC([unsetenv], [AC_DEFINE([STARPU_HAVE_UNSETENV], [1], [Define to 1 if the function unsetenv is available.])])
 

+ 1 - 1
doc/Makefile.am

@@ -60,7 +60,7 @@ uninstall-local:
 #	vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
 
 # Rule to update documentation on web server. Should only be used locally.
-PUBLISHHOST	= sync
+PUBLISHHOST	?= sync
 update-web: starpu.html
 	sed -i 's/gcc\.html#Attribute-Syntax/http:\/\/gcc.gnu.org\/onlinedocs\/gcc\/Attribute-Syntax.html#Attribute-Syntax/' starpu.html
 	scp starpu.pdf starpu.html $(PUBLISHHOST):/web/runtime/html/StarPU

+ 53 - 2
doc/chapters/advanced-api.texi

@@ -11,9 +11,10 @@
 * Multiformat Data Interface::  
 * Task Bundles::                
 * Task Lists::                  
-* Using Parallel Tasks::       
-* Scheduling Contexts::
+* Using Parallel Tasks::        
+* Scheduling Contexts::         
 * Defining a new scheduling policy::  
+* Running drivers::             
 * Expert mode::                 
 @end menu
 
@@ -792,6 +793,56 @@ static struct starpu_sched_policy dummy_sched_policy = @{
 @end smallexample
 @end cartouche
 
+@node Running drivers
+@section Running drivers
+
+@menu
+* Driver API::
+* Running drivers Example::
+@end menu
+
+@node Driver API
+@subsection Driver API
+
+@deftypefun int starpu_driver_init (struct starpu_driver *@var{d})
+Initialize the given driver. Returns 0 on success, -EINVAL if d->type is not
+STARPU_CUDA_WORKER.
+@end deftypefun
+
+@deftypefun int starpu_driver_run_once (struct starpu_driver *@var{d})
+Run the given driver for a while. Returns 0 on success, -EINVAL if d->type is
+not STARPU_CUDA_WORKER.
+@end deftypefun
+
+@deftypefun int starpu_driver_deinit (struct starpu_driver *@var{d})
+Deinitialize the given driver. Returns 0 on success, -EINVAL if d->type is not
+STARPU_CUDA_WORKER.
+@end deftypefun
+
+@node Running drivers Example
+@subsection Example
+
+@cartouche
+@smallexample
+int ret;
+struct starpu_driver d = @{
+    .type = STARPU_CUDA_WORKER,
+    .id.cuda_id = 0
+@};
+ret = starpu_driver_init(&d);
+if (ret != 0)
+    error();
+while (some_condition) @{
+    ret = starpu_driver_run_once(&d);
+    if (ret != 0)
+        error();
+@}
+ret = starpu_driver_deinit(&d);
+if (ret != 0)
+    error();
+@end smallexample
+@end cartouche
+
 @node Expert mode
 @section Expert mode
 

+ 0 - 7
examples/Makefile.am

@@ -196,7 +196,6 @@ examplebin_PROGRAMS +=				\
 	scheduler/dummy_sched			\
 	reductions/dot_product			\
 	reductions/minmax_reduction		\
-	mandelbrot/mandelbrot			\
 	ppm_downscaler/ppm_downscaler		\
 	ppm_downscaler/yuv_downscaler
 
@@ -780,12 +779,6 @@ endif
 # Mandelbrot Set #
 ##################
 
-mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
-if HAVE_X11
-mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
-mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) $(X_EXTRA_LIBS) -lX11
-endif
-
 ################
 # Top Examples #
 ################

+ 2 - 2
examples/lu/xlu_pivot.c

@@ -380,7 +380,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 	}
 #endif
 
-	double timing;
+	double timing=0.0;
 	int ret = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding, &timing);
 
 	FPRINTF(stderr, "Computation took (in ms)\n");
@@ -435,7 +435,7 @@ int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, uns
 		piv_description[block].last = (block + 1) * (size / nblocks);
 	}
 
-	double timing;
+	double timing=0.0;
 	int ret = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding, &timing);
 
 	FPRINTF(stderr, "Computation took (in ms)\n");

+ 12 - 25
gcc-plugin/tests/Makefile.am

@@ -15,20 +15,31 @@
 
 
 gcc_tests =					\
+  base.c					\
+  pointers.c					\
+  output-pointer.c				\
   output-pointer-errors.c			\
+  register.c					\
   register-errors.c				\
+  acquire.c					\
   acquire-errors.c				\
+  release.c					\
   release-errors.c				\
+  unregister.c					\
   unregister-errors.c				\
   task-errors.c					\
   scalar-tasks.c				\
   pointer-tasks.c				\
   external-task-impl.c				\
   no-initialize.c				\
+  lib-user.c					\
   wait-errors.c					\
+  heap-allocated.c				\
   heap-allocated-errors.c			\
   verbose.c					\
   debug-tree.c					\
+  opencl.c					\
+  opencl-errors.c				\
   shutdown-errors.c
 
 EXTRA_DIST =
@@ -42,35 +53,11 @@ gcc_tests += opencl-types.c
 # This test simulates errors when lacking an OpenCL implementation.
 gcc_tests += opencl-lacking.c
 
-gcc_tests +=  					\
-  base.c 					\
-  pointers.c 					\
-  output-pointer.c				\
-  register.c					\
-  acquire.c					\
-  release.c					\
-  unregister.c					\
-  lib-user.c					\
-  heap-allocated.c				\
-  opencl.c					\
-  opencl-errors.c
-
 else STARPU_USE_OPENCL
 
 EXTRA_DIST +=					\
-  base.c					\
-  pointers.c					\
   opencl-types.c				\
-  opencl-lacking.c				\
-  output-pointer.c				\
-  register.c					\
-  acquire.c					\
-  release.c					\
-  unregister.c					\
-  lib-user.c					\
-  heap-allocated.c				\
-  opencl.c					\
-  opencl-errors.c
+  opencl-lacking.c
 
 endif STARPU_USE_OPENCL
 

+ 7 - 1
gcc-plugin/tests/mocks.h

@@ -424,13 +424,19 @@ starpu_free (void *ptr)
 
 /* OpenCL support.  */
 
-#define STARPU_USE_OPENCL 1
+#ifndef STARPU_USE_OPENCL
 
+# define STARPU_USE_OPENCL 1
+
+/* The `opencl' pragma needs this structure, so make sure it's defined.  */
 struct starpu_opencl_program
 {
   /* Nothing.  */
 };
 
+#endif
+
+
 /* Number of `load_opencl_from_string' calls.  */
 static unsigned int load_opencl_calls;
 

+ 4 - 0
gcc-plugin/tests/run-test.in

@@ -73,6 +73,9 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
 (define %cuda-cppflags
   (string-tokenize "@STARPU_CUDA_CPPFLAGS@"))
 
+(define %opencl-cppflags
+  (string-tokenize "@STARPU_OPENCL_CPPFLAGS@"))
+
 (define %default-cflags
   `("-I" ,%srcdir
     "-I" ,(string-append %srcdir "/../../src")    ; for <common/uthash.h>
@@ -81,6 +84,7 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
     "-I" ,(string-append %builddir "/../..")
 
     ,@%cuda-cppflags
+    ,@%opencl-cppflags
 
     ;; Unfortunately `libtool --mode=execute' doesn't help here, so hard-code
     ;; the real file name.

+ 5 - 2
socl/examples/Makefile.am

@@ -43,18 +43,21 @@ examplebin_PROGRAMS =
 
 examplebin_PROGRAMS +=		\
 	basic/basic		\
-	clinfo/clinfo
+	clinfo/clinfo \
+  matmul/matmul
 
 #	mandelbrot/mandelbrot
 
 SOCL_EXAMPLES +=		\
 	basic/basic		\
-	clinfo/clinfo
+	clinfo/clinfo\
+  matmul/matmul
 
 #	mandelbrot/mandelbrot
 
 basic_basic_SOURCES = basic/basic.c
 clinfo_clinfo_SOURCES = clinfo/clinfo.c
+matmul_matmul_SOURCES = matmul/matmul.c
 #mandelbrot_mandelbrot_SOURCES = mandelbrot/mandelbrot.c
 
 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)

+ 477 - 0
socl/examples/matmul/matmul.c

@@ -0,0 +1,477 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <CL/cl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <assert.h>
+#include <math.h>
+#include <sys/time.h>
+
+/* Print a printf-style message prefixed with "Error: " on stderr, then exit
+ * with EXIT_FAILURE. The first argument must be a string literal. */
+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
+/* Evaluate exp (a cl_int status) once; if it is not CL_SUCCESS, report the
+ * failing expression on stderr and exit. The temporary has a private name so
+ * that check(err) does not expand to the self-initialisation "cl_int err = err". */
+#define check(exp) do { cl_int check_status_ = (exp); if(check_status_ != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n", check_status_, #exp); exit(EXIT_FAILURE); }} while(0)
+/* Execute exp, which must store its status in a cl_int named err that is in
+ * scope at the call site, then test err like check() does. Wrapped in
+ * do/while(0) so the macro is a single statement, even under an unbraced if. */
+#define check2(exp) do { exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n", err, #exp); exit(EXIT_FAILURE); }} while(0)
+
+// Kernel launch geometry and element type
+#define BLOCK_SIZE 16  // Edge of the square work-group: main() uses a BLOCK_SIZE x BLOCK_SIZE local size
+#define WORK_SIZE 64  // Rows of A (and of C) per block; BLOCKS is derived from it below
+#define TYPE float // Element type of the host matrices
+
+// Basic matrix dimensions, in elements
+#define WA (1024L * BLOCK_SIZE) // Matrix A width
+#define HA (512L * BLOCK_SIZE) // Matrix A height
+#define WB (1024L * BLOCK_SIZE) // Matrix B width
+#define HB WA  // Matrix B height, tied to the width of A
+#define WC WB  // Matrix C width, same as the width of B
+#define HC HA  // Matrix C height, same as the height of A
+#define BLOCKS (HA / WORK_SIZE) // Number of row blocks; sizes the per-block arrays in main()
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void printDiff(TYPE*, TYPE*, int, int, int, TYPE);
+void computeReference(TYPE*, const TYPE*, const TYPE*, unsigned int, unsigned int, unsigned int);
+
+#define str(x) #x
+
+#define CODE "\
+#define TYPE float\n\
+__kernel void sgemmNN(int wa, int ha, int wb,  __global TYPE* A, __global TYPE* B, __global TYPE* C) {\n\
+#define BS 16\n\
+#define BLOCK_SIZE 16\n\
+  int bx = get_group_id(0);\n\
+  int by = get_group_id(1);\n\
+  \n\
+  int tx = get_local_id(0);\n\
+  int ty = get_local_id(1);\n\
+  \n\
+  int gx = get_global_id(0);\n\
+  int gy = get_global_id(1);\n\
+    __local float As[BS][BS+1];\
+    __local float Bs[BS][BS+1];\
+  \n\
+  unsigned int block_w = min(wb - bx * BLOCK_SIZE, BLOCK_SIZE);\n\
+  unsigned int block_h = min(ha - by * BLOCK_SIZE, BLOCK_SIZE);\n\
+  \n\
+  int valid = (gx < wb && gy < ha);\n\
+  \n\
+  TYPE Csub = (TYPE)0.0;\n\
+  \n\
+  int pos = 0;\n\
+  while (pos < wa) {\n\
+    unsigned int size = min(wa-pos, BLOCK_SIZE);\n\
+    if (tx < size && gy < ha)\n\
+      As[tx][ty] = A[pos + tx + wa * gy];\n\
+    if (ty < size && gx < wb)\n\
+      Bs[tx][ty] = B[gx + wb * (pos+ty)];\n\
+    \n\
+    barrier(CLK_LOCAL_MEM_FENCE);\n\
+    \n\
+    if (valid) {\n\
+      for (int k = 0; k < size; ++k)\n\
+        Csub += As[k][ty] * Bs[tx][k];\n\
+    }\n\
+    pos += size;\n\
+    barrier(CLK_LOCAL_MEM_FENCE);\n\
+  }\n\
+  \n\
+  if (valid)\n\
+    C[wb * gy + gx] = Csub;\n\
+}"
+
+static char * code =  CODE;
+
+int check = 0;
+
+/* Scan the command line: "-check" sets the global check flag and "-h" prints
+ * a usage line on stdout. Any other argument is ignored. */
+static void __attribute__((unused)) parse_args(int argc, char **argv)
+{
+	int pos = 1;
+
+	while (pos < argc)
+	{
+		const char *opt = argv[pos++];
+
+		if (!strcmp(opt, "-check"))
+			check = 1;
+		else if (!strcmp(opt, "-h"))
+			printf("usage : %s [-check]\n", argv[0]);
+	}
+}
+
+#define shrLog(...) fprintf(stderr, __VA_ARGS__);
+
+// Round global_size up to the nearest multiple of group_size
+size_t shrRoundUp(int group_size, int global_size) {
+	const int remainder = global_size % group_size;
+
+	return (remainder != 0) ? global_size + group_size - remainder : global_size;
+}
+
+/* Store rand() scaled by 1/RAND_MAX into each of the first iSize elements
+ * of pfData. */
+void fillArray(TYPE* pfData, int iSize) {
+	const TYPE scale = (TYPE)(1.0f / (float)RAND_MAX);
+	int idx = 0;
+
+	while (idx < iSize) {
+		pfData[idx] = scale * rand();
+		idx++;
+	}
+}
+
+/* Log the first iSize elements of pfData through shrLog, one "index: value"
+ * line per element. */
+void shrPrintArray(float* pfData, int iSize) {
+	int idx = 0;
+
+	while (idx < iSize) {
+		shrLog("%d: %.3f\n", idx, pfData[idx]);
+		idx++;
+	}
+}
+
+/**
+ * Compare two float arrays using the relative L2-norm of their difference.
+ * @return 1 if ||reference - data|| / ||reference|| is below \a epsilon,
+ *         0 otherwise; 0 is also returned when the squared norm of
+ *         \a reference is below 1e-7, since the ratio is then meaningless
+ * @param reference  handle to the reference data / gold image
+ * @param data       handle to the computed data
+ * @param len        number of elements in reference and data
+ * @param epsilon    tolerance on the relative error, must not be negative
+*/
+int shrCompareL2fe( const float* reference, const float* data, const unsigned int len, const float epsilon ) {
+	assert(epsilon >= 0);
+
+	/* Sums of squares; the locals avoid the name of the error() macro. */
+	float sqError = 0;
+	float sqRef = 0;
+
+	unsigned int i;
+	for(i = 0; i < len; ++i) {
+		float diff = reference[i] - data[i];
+		sqError += diff * diff;
+		sqRef += reference[i] * reference[i];
+	}
+
+	if (fabs(sqRef) < 1e-7) {
+#ifdef _DEBUG
+		fprintf(stderr, "ERROR, reference l2-norm is 0\n");
+#endif
+		return 0;
+	}
+	float relError = sqrtf(sqError) / sqrtf(sqRef);
+	int result = relError < epsilon;
+#ifdef _DEBUG
+	if( !result) {
+		/* Both values are floats (promoted to double): %f, not %d. */
+		fprintf(stderr, "ERROR, l2-norm error %f is greater than epsilon %f \n", relError, epsilon);
+	}
+#endif
+
+	return result;
+}
+
+
+/*
+ * Multiply A (HA x WA) by B (HB x WB) into C with every OpenCL device found
+ * on up to 5 platforms. A and C are cut into BLOCKS row blocks which are
+ * handed to the devices round-robin; each device context gets its own buffer
+ * over the whole of B. The elapsed time and throughput are printed, and when
+ * the global check flag is set (-check) the result is compared against a
+ * host computation. Returns 0; every failure exits with EXIT_FAILURE.
+ */
+int main(int argc, const char** argv) {
+	cl_uint platform_count;
+	cl_platform_id platforms[5];
+
+	cl_int err = CL_SUCCESS;
+	unsigned int i, p;
+
+	cl_device_type dev_type = CL_DEVICE_TYPE_ALL;
+
+	void * ptrs[BLOCKS];
+	cl_mem d_A[BLOCKS];
+	cl_mem d_C[BLOCKS];
+	cl_mem d_B[BLOCKS];
+
+	cl_event GPUDone[BLOCKS];
+	cl_event GPUExecution[BLOCKS];
+	struct timeval start, end;
+
+	int workOffset[BLOCKS];
+	int workSize[BLOCKS];
+
+	/* Kernel arguments "wa" and "wb": the full widths of A and B, which are
+	 * the same for every block (only the block height varies). */
+	cl_int kernel_wa = WA;
+	cl_int kernel_wb = WB;
+
+	unsigned int sizePerGPU = HC / BLOCKS;
+	unsigned int sizeMod = HC % BLOCKS;
+
+	size_t A_size = WA * HA;
+	size_t A_mem_size = sizeof(TYPE) * A_size;
+	TYPE* A_data;
+
+	size_t B_size = WB * HB;
+	size_t B_mem_size = sizeof(TYPE) * B_size;
+	TYPE* B_data;
+
+	size_t C_size = WC * HC;
+	size_t C_mem_size = sizeof(TYPE) * C_size;
+	TYPE* C_data;
+
+	/* parse_args() only reads the strings. */
+	parse_args(argc, (char **)argv);
+
+	check(clGetPlatformIDs(5, platforms, &platform_count));
+	if (platform_count == 0)
+		error("No platform found\n");
+	/* The count returned is the number of available platforms, which may be
+	 * larger than the number of entries actually written to platforms[]. */
+	if (platform_count > 5)
+		platform_count = 5;
+
+	cl_uint device_count;
+	cl_uint devs[platform_count];
+	cl_device_id * devices[platform_count];
+	cl_context ctx[platform_count];
+	cl_command_queue * commandQueue[platform_count];
+
+	device_count = 0;
+	for (p=0; p<platform_count; p++) {
+		cl_platform_id platform = platforms[p];
+
+		/* Not named "err": check(err) would otherwise be at the mercy of
+		 * the temporary declared inside the check() macro. */
+		cl_int status = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
+		if (status == CL_DEVICE_NOT_FOUND) {
+			devs[p] = 0;
+			continue;
+		}
+		check(status);
+		if (devs[p] == 0)
+			continue;
+
+		devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
+		commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);
+		if (devices[p] == NULL || commandQueue[p] == NULL) {
+			perror("malloc");
+			exit(EXIT_FAILURE);
+		}
+
+		check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));
+
+		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
+		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));
+
+		for(i = 0; i < devs[p]; ++i) 
+		{
+			cl_device_id device = devices[p][i];
+			char name[2048];
+			name[0] = '\0';
+			clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
+			printf("Device %u: %s\n", i, name);
+
+			check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
+		}
+
+		device_count += devs[p];
+	}
+
+	if (device_count == 0)
+		error("No device found\n");
+
+
+
+	cl_kernel multiplicationKernel[platform_count];
+
+	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n", 
+			(unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);
+
+	// allocate host memory for matrices A, B and C; nothing can be done without them
+	A_data = (TYPE*)malloc(A_mem_size);
+	if (A_data == NULL) {
+		perror("malloc");
+		exit(EXIT_FAILURE);
+	}
+
+	B_data = (TYPE*)malloc(B_mem_size);
+	if (B_data == NULL) {
+		perror("malloc");
+		exit(EXIT_FAILURE);
+	}
+
+	C_data = (TYPE*) malloc(C_mem_size);
+	if (C_data == NULL) {
+		perror("malloc");
+		exit(EXIT_FAILURE);
+	}
+
+	cl_program program[platform_count];
+
+	for (p=0; p<platform_count; p++) {
+		if (devs[p] == 0)
+			continue;
+
+		check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));
+
+		check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));
+
+		check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
+	}
+
+	printf("Initializing data...\n");
+	srand(2008);
+	fillArray(A_data, A_size);
+	fillArray(B_data, B_size);
+	/* Clear the whole matrix: the size is in bytes, not in elements. */
+	memset(C_data, 0, C_mem_size);
+
+
+	printf("Computing...\n");
+	workOffset[0] = 0;
+	gettimeofday(&start, NULL);
+
+	size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
+	/* One buffer over B per device, numbered across all platforms. */
+	int c = 0;
+	for (p=0; p<platform_count;p++) {
+		for (i=0; i<devs[p]; i++) {
+			check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
+			c++;
+		}
+	}
+
+	for(i=0; i < BLOCKS; ++i) 
+	{
+		/* Blocks go to the devices round-robin. */
+		int d = i % device_count;
+		cl_uint p = 0;
+
+		// determine device platform: turn the global index d into (p, dev)
+		int dev = d;
+		for (p = 0; p < platform_count; p++) {
+			if ((cl_int)(dev - devs[p]) < 0)
+				break;
+			dev -= devs[p];
+		}
+
+		workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;        
+
+		check2(d_A[i] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
+		check2(d_C[i] = clCreateBuffer(ctx[p], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));
+
+		/* sgemmNN(wa, ha, wb, A, B, C): this block of A is workSize[i] rows
+		 * of WA elements, and B is WB elements wide. */
+		check(clSetKernelArg(multiplicationKernel[p], 0, sizeof(cl_int), &kernel_wa));
+		check(clSetKernelArg(multiplicationKernel[p], 1, sizeof(cl_int), &workSize[i]));
+		check(clSetKernelArg(multiplicationKernel[p], 2, sizeof(cl_int), &kernel_wb));
+		check(clSetKernelArg(multiplicationKernel[p], 3, sizeof(cl_mem), (void *) &d_A[i]));
+		check(clSetKernelArg(multiplicationKernel[p], 4, sizeof(cl_mem), (void *) &d_B[d]));
+		check(clSetKernelArg(multiplicationKernel[p], 5, sizeof(cl_mem), (void *) &d_C[i]));
+
+		size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE,WC), shrRoundUp(BLOCK_SIZE,workSize[i])};
+
+		check(clEnqueueNDRangeKernel(commandQueue[p][dev], multiplicationKernel[p], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));
+
+		// Non-blocking copy of result from device to host
+		check2(ptrs[i] = clEnqueueMapBuffer(commandQueue[p][dev], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));
+
+		if(i+1 < BLOCKS)
+			workOffset[i + 1] = workOffset[i] + workSize[i];
+	}
+
+
+	// CPU sync with GPU
+	for (p=0; p<platform_count;p++) {
+		cl_uint dev;
+		for (dev=0; dev<devs[p]; dev++) {
+			clFinish(commandQueue[p][dev]);
+		}
+	}
+
+	gettimeofday(&end, NULL);
+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	double dSeconds = timing/1000/1000;
+	double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
+	double gflops = 1.0e-9 * dNumOps/dSeconds;
+
+	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %u, Blocks = %ld, Workgroup = %zu\n", 
+			gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);
+
+	for (i=0; i<device_count; i++) {
+		clReleaseMemObject(d_B[i]);
+	}
+
+	for(i = 0; i < BLOCKS; i++) 
+	{
+		clReleaseMemObject(d_A[i]);
+		clReleaseMemObject(d_C[i]);
+		clReleaseEvent(GPUExecution[i]);
+		clReleaseEvent(GPUDone[i]);
+	}
+
+
+	// compute reference solution
+	if (check) {
+		printf("Comparing results with CPU computation... ");
+		TYPE* reference = (TYPE*)malloc(C_mem_size);
+		if (reference == NULL) {
+			perror("malloc");
+			exit(EXIT_FAILURE);
+		}
+		computeReference(reference, A_data, B_data, HA, WA, WB);
+
+		// check result
+		int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
+		if (res == 0) {
+			printf("\n\n");
+			printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
+		}
+		else printf("PASSED\n\n");
+		free(reference);
+	}
+
+	for (p=0; p<platform_count;p++) {
+		if (devs[p] == 0)
+			continue;
+
+		check(clReleaseKernel(multiplicationKernel[p]));
+		check(clReleaseProgram(program[p]));
+		check(clReleaseContext(ctx[p]));
+		cl_uint k;
+		for(k = 0; k < devs[p]; ++k) 
+		{
+			check(clReleaseCommandQueue(commandQueue[p][k]));
+		}
+
+		/* The per-platform arrays were only allocated when devices exist. */
+		free(commandQueue[p]);
+		free(devices[p]);
+	}
+
+	free(A_data);
+	free(B_data);
+	free(C_data);
+
+	return 0;
+}
+
+/* Walk two width x height arrays row by row and log, through shrLog, the
+ * first iListLength positions whose absolute difference exceeds fListTol,
+ * followed by the total number of such positions. */
+void printDiff(TYPE *data1, TYPE *data2, int width, int height, int iListLength, TYPE fListTol) {
+	int mismatches = 0;
+	int row, col;
+
+	shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
+	for (row = 0; row < height; row++) {
+		/* Row headers stop once the listing quota is used up. */
+		if (mismatches < iListLength) {
+			shrLog("\n  Row %d:\n", row);
+		}
+		for (col = 0; col < width; col++) {
+			const int pos = row * width + col;
+			float fDiff = fabs(data1[pos] - data2[pos]);
+
+			if (!(fDiff > fListTol))
+				continue;
+			if (mismatches < iListLength) {
+				shrLog("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", col, row, data1[pos], data2[pos], fDiff);
+			}
+			mismatches++;
+		}
+	}
+	shrLog(" \n  Total Errors = %d\n\n", mismatches);
+}
+
+/**
+ * Compute the reference product C = A * B on the host.
+ * Every element of C is accumulated in double precision and then converted
+ * to TYPE. All three matrices are indexed row by row.
+ * @param C          reference data, computed but preallocated (hA x wB)
+ * @param A          matrix A as provided to device (hA x wA)
+ * @param B          matrix B as provided to device (wA x wB)
+ * @param hA         height of matrix A
+ * @param wA         width of matrix A, also the height of matrix B
+ * @param wB         width of matrix B
+*/
+void computeReference(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, unsigned int wA, unsigned int wB) {
+	unsigned int row, col, inner;
+
+	for (row = 0; row < hA; ++row) {
+		for (col = 0; col < wB; ++col) {
+			double acc = 0;
+
+			for (inner = 0; inner < wA; ++inner)
+				acc += (double)A[row * wA + inner] * (double)B[inner * wB + col];
+			C[row * wB + col] = (TYPE)acc;
+		}
+	}
+}
+

+ 0 - 3
socl/src/init.c

@@ -28,9 +28,6 @@ __attribute__((constructor)) static void socl_init() {
   struct starpu_conf conf;
   starpu_conf_init(&conf);
   conf.ncuda = 0;
-  putenv("STARPU_NCUDA=0");
-  putenv("STARPU_NOPENCL=1");
-  putenv("STARPU_NCPUS=1");
 
   mem_object_init();
 

+ 12 - 16
src/core/dependencies/data_concurrency.c

@@ -260,8 +260,11 @@ static unsigned unlock_one_requester(struct _starpu_data_requester *r)
 		return 0;
 }
 
-/* The header lock must already be taken by the caller */
-void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
+/* The header lock must already be taken by the caller.
+ * This may free the handle if it was lazily unregistered (1 is returned in
+ * that case). The handle pointer thus becomes invalid for the caller.
+ */
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 {
 	_starpu_spin_checklocked(&handle->header_lock);
 	/* A data access has finished so we remove a reference. */
@@ -269,19 +272,9 @@ void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 	handle->refcnt--;
 	STARPU_ASSERT(handle->busy_count > 0);
 	handle->busy_count--;
-	_starpu_data_check_not_busy(handle);
-
-	/* The handle has been destroyed in between (eg. this was a temporary
-	 * handle created for a reduction.) */
-	if (handle->lazy_unregister && handle->refcnt == 0)
-	{
-		_starpu_spin_unlock(&handle->header_lock);
-		starpu_data_unregister_no_coherency(handle);
-		/* Warning: in case we unregister the handle, we must be sure
-		 * that the caller will not try to unlock the header after
-		 * !*/
-		return;
-	}
+	if (_starpu_data_check_not_busy(handle))
+		/* Handle was destroyed, nothing left to do.  */
+		return 1;
 
 	/* In case there is a pending reduction, and that this is the last
 	 * requester, we may go back to a "normal" coherency model. */
@@ -358,7 +351,10 @@ void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 			_starpu_spin_lock(&handle->header_lock);
 			STARPU_ASSERT(handle->busy_count > 0);
 			handle->busy_count--;
-			_starpu_data_check_not_busy(handle);
+			if (_starpu_data_check_not_busy(handle))
+				return 1;
 		}
 	}
+
+	return 0;
 }

+ 2 - 2
src/core/dependencies/data_concurrency.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -22,7 +22,7 @@
 
 unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j);
 
-void _starpu_notify_data_dependencies(starpu_data_handle_t handle);
+int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
 
 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
 							  enum starpu_access_mode mode,

+ 35 - 0
src/core/dependencies/implicit_data_deps.c

@@ -405,6 +405,41 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
 }
 
+/* This is the same as _starpu_release_data_enforce_sequential_consistency, but
+ * for all data of a task */
+void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
+{
+	struct starpu_task *task = j->task;
+        struct starpu_buffer_descr *descrs = j->ordered_buffers;
+
+	if (!task->cl)
+		return;
+
+        unsigned nbuffers = task->cl->nbuffers;
+
+	unsigned index;
+	for (index = 0; index < nbuffers; index++)
+	{
+		starpu_data_handle_t handle = descrs[index].handle;
+
+		if (index && descrs[index-1].handle == descrs[index].handle)
+			/* We have already released this data, skip it. This
+			 * depends on ordering putting writes before reads, see
+			 * _starpu_compar_handles */
+			continue;
+
+		_starpu_release_data_enforce_sequential_consistency(task, handle);
+		/* Release the reference acquired in _starpu_push_task_output */
+		_starpu_spin_lock(&handle->header_lock);
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+		if (!_starpu_data_check_not_busy(handle))
+			_starpu_spin_unlock(&handle->header_lock);
+
+	}
+}
+
+
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle)
 {
         _STARPU_LOG_IN();

+ 1 - 0
src/core/dependencies/implicit_data_deps.h

@@ -25,6 +25,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 						   starpu_data_handle_t handle, enum starpu_access_mode mode);
 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
 void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle);
+void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
 void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);

+ 3 - 0
src/core/jobs.c

@@ -163,6 +163,9 @@ void _starpu_handle_job_termination(struct _starpu_job *j, int workerid)
 
 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 
+	/* Tell other tasks that we don't exist any more, thus no need for
+	 * implicit dependencies any more.  */
+	_starpu_release_task_enforce_sequential_consistency(j);
 	/* Task does not have a cl, but has explicit data dependencies, we need
 	 * to tell them that we will not exist any more before notifying the
 	 * tasks waiting for us */

+ 2 - 2
src/core/sched_policy.c

@@ -46,7 +46,7 @@ extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
 extern struct starpu_sched_policy _starpu_sched_eager_policy;
 extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
 extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
-extern struct starpu_sched_policy heft_policy;
+extern struct starpu_sched_policy _starpu_sched_heft_policy;
 
 static struct starpu_sched_policy *predefined_policies[] =
 {
@@ -54,7 +54,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_prio_policy,
 	&_starpu_sched_dm_policy,
 	&_starpu_sched_dmda_policy,
-	&heft_policy,
+	&_starpu_sched_heft_policy,
 	&_starpu_sched_dmda_ready_policy,
 	&_starpu_sched_dmda_sorted_policy,
 	&_starpu_sched_random_policy,

+ 6 - 16
src/datawizard/coherency.c

@@ -559,16 +559,8 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
 	STARPU_ASSERT(handle->busy_count > 0);
 	handle->busy_count--;
-	_starpu_data_check_not_busy(handle);
 
-	/* In case there was a temporary handle (eg. used for reduction), this
-	 * handle may have requested to be destroyed when the data is released
-	 * */
-	unsigned handle_was_destroyed = handle->lazy_unregister;
-
-	_starpu_notify_data_dependencies(handle);
-
-	if (!handle_was_destroyed)
+	if (!_starpu_notify_data_dependencies(handle))
 		_starpu_spin_unlock(&handle->header_lock);
 }
 
@@ -723,15 +715,13 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 
 		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
 
-		/* In case there was a temporary handle (eg. used for
-		 * reduction), this handle may have requested to be destroyed
-		 * when the data is released
-		 * */
-		unsigned handle_was_destroyed = handle->lazy_unregister;
+		/* Keep a reference for future
+		 * _starpu_release_task_enforce_sequential_consistency call */
+		_starpu_spin_lock(&handle->header_lock);
+		handle->busy_count++;
+		_starpu_spin_unlock(&handle->header_lock);
 
 		_starpu_release_data_on_node(handle, mask, local_replicate);
-		if (!handle_was_destroyed)
-			_starpu_release_data_enforce_sequential_consistency(task, handle);
 	}
 
 	if (profiling && task->profiling_info)

+ 1 - 1
src/datawizard/coherency.h

@@ -110,7 +110,7 @@ struct _starpu_data_state
 	struct _starpu_spinlock header_lock;
 
 	/* Condition to make application wait for all transfers before freeing handle */
-	/* busy_count is the number of handle->refcnt, handle->per_node[*]->refcnt, and number of starpu_data_requesters */
+	/* busy_count is the number of handle->refcnt, handle->per_node[*]->refcnt, number of starpu_data_requesters, and number of tasks that have released it but are still registered on the implicit data dependency lists. */
 	/* Core code which releases busy_count has to call
 	 * _starpu_data_check_not_busy to let starpu_data_unregister proceed */
 	unsigned busy_count;

+ 3 - 2
src/datawizard/data_request.c

@@ -296,7 +296,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 		handle->busy_count--;
 	}
 
-	_starpu_data_check_not_busy(handle);
+	unsigned destroyed = _starpu_data_check_not_busy(handle);
 
 	r->refcnt--;
 
@@ -314,7 +314,8 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	if (do_delete)
 		starpu_data_request_destroy(r);
 
-	_starpu_spin_unlock(&handle->header_lock);
+	if (!destroyed)
+		_starpu_spin_unlock(&handle->header_lock);
 
 	/* We do the callback once the lock is released so that they can do
 	 * blocking operations with the handle (eg. release it) */

+ 26 - 7
src/datawizard/interfaces/data_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -405,8 +405,11 @@ struct _starpu_unregister_callback_arg
 
 /* Check whether we should tell starpu_data_unregister that the data handle is
  * not busy any more.
- * The header is supposed to be locked */
-void _starpu_data_check_not_busy(starpu_data_handle_t handle)
+ * The header is supposed to be locked.
+ * This may free the handle, if it was lazily unregistered (1 is returned in
+ * that case).  The handle pointer thus becomes invalid for the caller.
+ */
+int _starpu_data_check_not_busy(starpu_data_handle_t handle)
 {
 	if (!handle->busy_count && handle->busy_waiting)
 	{
@@ -414,6 +417,20 @@ void _starpu_data_check_not_busy(starpu_data_handle_t handle)
 		_STARPU_PTHREAD_COND_BROADCAST(&handle->busy_cond);
 		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
 	}
+
+	/* The handle has been destroyed in between (eg. this was a temporary
+	 * handle created for a reduction.) */
+	if (handle->lazy_unregister && handle->busy_count == 0)
+	{
+		_starpu_spin_unlock(&handle->header_lock);
+		starpu_data_unregister_no_coherency(handle);
+		/* Warning: in case we unregister the handle, we must be sure
+		 * that the caller will not try to unlock the header after
+		 * !*/
+		return 1;
+	}
+
+	return 0;
 }
 
 static void _starpu_data_unregister_fetch_data_callback(void *_arg)
@@ -519,14 +536,16 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 			func(buffers, NULL);
 		}
 	}
-	else
-	{
+
+	_starpu_spin_lock(&handle->header_lock);
+	if (!coherent) {
 		/* Should we postpone the unregister operation ? */
-		if ((handle->refcnt > 0) && handle->lazy_unregister)
+		if ((handle->busy_count > 0) && handle->lazy_unregister) {
+			_starpu_spin_unlock(&handle->header_lock);
 			return;
+		}
 	}
 
-	_starpu_spin_lock(&handle->header_lock);
 	/* Tell holders of references that we're starting waiting */
 	handle->busy_waiting = 1;
 	_starpu_spin_unlock(&handle->header_lock);

+ 2 - 2
src/datawizard/interfaces/data_interface.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -27,7 +27,7 @@ void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 	STARPU_ATTRIBUTE_INTERNAL;
 
 extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
-extern void _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
+extern int _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
 
 extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,

+ 2 - 0
src/datawizard/reduction.c

@@ -342,7 +342,9 @@ void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 		if (handle->reduction_tmp_handles[worker])
 		{
 //			fprintf(stderr, "unregister handle %p\n", handle);
+			_starpu_spin_lock(&handle->reduction_tmp_handles[worker]->header_lock);
 			handle->reduction_tmp_handles[worker]->lazy_unregister = 1;
+			_starpu_spin_unlock(&handle->reduction_tmp_handles[worker]->header_lock);
 			starpu_data_unregister_no_coherency(handle->reduction_tmp_handles[worker]);
 			handle->per_worker[worker].refcnt--;
 			/* TODO put in cache */

+ 3 - 8
src/datawizard/user_interactions.c

@@ -334,8 +334,8 @@ static void _prefetch_data_on_node(void *arg)
 	}
 
 	_starpu_spin_lock(&handle->header_lock);
-	_starpu_notify_data_dependencies(handle);
-	_starpu_spin_unlock(&handle->header_lock);
+	if (!_starpu_notify_data_dependencies(handle))
+		_starpu_spin_unlock(&handle->header_lock);
 }
 
 static
@@ -376,17 +376,12 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 			STARPU_ASSERT(replicate->refcnt >= 0);
 			STARPU_ASSERT(handle->busy_count > 0);
 			handle->busy_count--;
-			_starpu_data_check_not_busy(handle);
 		}
 
 		/* In case there was a temporary handle (eg. used for reduction), this
 		 * handle may have requested to be destroyed when the data is released
 		 * */
-		unsigned handle_was_destroyed = handle->lazy_unregister;
-
-		_starpu_notify_data_dependencies(handle);
-
-		if (!handle_was_destroyed)
+		if (!_starpu_notify_data_dependencies(handle))
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 	else if (!async)

+ 2 - 2
src/datawizard/write_back.c

@@ -24,8 +24,8 @@ static void wt_callback(void *arg)
 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
 
 	_starpu_spin_lock(&handle->header_lock);
-	_starpu_notify_data_dependencies(handle);
-	_starpu_spin_unlock(&handle->header_lock);
+	if (!_starpu_notify_data_dependencies(handle))
+		_starpu_spin_unlock(&handle->header_lock);
 }
 
 void _starpu_write_through_data(starpu_data_handle_t handle, uint32_t requesting_node,

+ 1 - 1
src/sched_policies/heft.c

@@ -641,7 +641,7 @@ static void heft_deinit(unsigned sched_ctx_id)
 	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
 }
 
-struct starpu_sched_policy heft_policy = 
+struct starpu_sched_policy _starpu_sched_heft_policy =
 {
 	.init_sched = heft_init,
 	.deinit_sched = heft_deinit,

+ 7 - 1
tests/Makefile.am

@@ -215,7 +215,11 @@ noinst_PROGRAMS =				\
 	parallel_tasks/parallel_kernels		\
 	parallel_tasks/parallel_kernels_spmd	\
 	perfmodels/regression_based		\
-	perfmodels/non_linear_regression_based 
+	perfmodels/non_linear_regression_based  \
+	sched_policies/data_locality            \
+	sched_policies/execute_all_tasks        \
+	sched_policies/simple_deps              \
+	sched_policies/simple_cpu_gpu_sched
 
 if STARPU_HAVE_WINDOWS
 check_PROGRAMS = $(noinst_PROGRAMS)
@@ -543,5 +547,7 @@ perfmodels_non_linear_regression_based_SOURCES+=\
 	perfmodels/opencl_memset.c
 endif
 
+sched_policies_execute_all_tasks_LDFLAGS = -lm
+
 showcheck:
 	-cat $(TEST_LOGS) /dev/null

+ 9 - 0
tests/datawizard/gpu_register.c

@@ -21,6 +21,13 @@
 #include "../helper.h"
 #include "scal.h"
 
+#if ! (defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA))
+int main(int argc, char **argv)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+
 static int
 submit_tasks(starpu_data_handle_t handle, int pieces, int n)
 {
@@ -292,3 +299,5 @@ fail:
 	starpu_shutdown();
 	return EXIT_FAILURE;
 }
+
+#endif /* defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) */

+ 2 - 0
tests/datawizard/readonly.c

@@ -21,10 +21,12 @@
 #endif
 #include "../helper.h"
 
+#ifdef STARPU_USE_OPENCL
 static void codelet(void *descr[], __attribute__ ((unused)) void *_args)
 {
      FPRINTF(stderr, "codelet\n");
 }
+#endif
 
 static struct starpu_codelet cl =
 {

+ 193 - 0
tests/sched_policies/data_locality.c

@@ -0,0 +1,193 @@
+#include <starpu.h>
+#include <starpu_profiling.h>
+
+#include "../helper.h"
+
+#define NTASKS 8
+
+/*
+ * It is very inefficient to keep moving data between memory nodes. This
+ * test makes sure the scheduler will take account of the data locality
+ * when scheduling tasks.
+ *
+ * Applies to : dmda, heft, pheft.
+ */
+
+static void
+dummy(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+}
+
+/* 
+ * Dummy cost function, used to make sure the scheduler does schedule the
+ * task, instead of getting rid of it as soon as possible because it doesn't
+ * know its expected length.
+ */
+static double
+cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.0;
+}
+
+static struct starpu_perfmodel model =
+{
+	.type          = STARPU_COMMON,
+	.cost_function = cost_function
+};
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs     = { dummy, NULL },
+	.cuda_funcs    = { dummy, NULL },
+	.opencl_funcs  = { dummy, NULL },
+	.modes         = { STARPU_RW },
+	.model         = &model,
+	.nbuffers      = 1
+};
+
+static int var = 42;
+static starpu_data_handle_t rw_handle;
+
+static void
+init_data(void)
+{
+	starpu_variable_data_register(&rw_handle, 0, (uintptr_t) &var,
+					sizeof(var));
+}
+
+static void
+free_data(void)
+{
+	starpu_data_unregister(rw_handle);
+}
+
+/*
+ * Run the data-locality scenario under scheduling policy POLICY: one
+ * synchronous task restricted to CUDA/OpenCL workers touches rw_handle, then
+ * NTASKS-1 further tasks that may also run on CPUs use the same handle.
+ *
+ * Returns 0 when the profiling data shows that every task ran on the same
+ * worker as the first one, 1 when it does not, and -ENODEV when the test
+ * cannot run on this machine (no device, or no CPU/GPU worker pair).
+ */
+static int
+run(struct starpu_sched_policy *policy)
+{
+	int ret;
+	int i;
+	unsigned workerid;
+	struct starpu_conf conf;
+	struct starpu_task *tasks[NTASKS] = { NULL };
+
+	starpu_conf_init(&conf);
+	conf.sched_policy = policy;
+
+	ret = starpu_init(&conf);
+	if (ret == -ENODEV)
+		/* Initialization failed: there is nothing to shut down. */
+		return -ENODEV;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	if (starpu_cpu_worker_get_count() == 0 ||
+	    (starpu_cuda_worker_get_count() == 0 &&
+	     starpu_opencl_worker_get_count() == 0))
+	{
+		/* StarPU is running but no task or data exists yet. */
+		starpu_shutdown();
+		return -ENODEV;
+	}
+
+	starpu_profiling_status_set(1);
+	init_data();
+
+	/* Send the handle to a GPU. */
+	cl.where = STARPU_CUDA | STARPU_OPENCL;
+	tasks[0] = starpu_task_create();
+	tasks[0]->cl = &cl;
+	tasks[0]->synchronous = 1;
+	tasks[0]->handles[0] = rw_handle;
+	tasks[0]->destroy = 0;
+	ret = starpu_task_submit(tasks[0]);
+	if (ret == -ENODEV)
+		goto cleanup;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	/* Now, run multiple tasks using this handle. */
+	cl.where |= STARPU_CPU;
+	for (i = 1; i < NTASKS; i++)
+	{
+		tasks[i] = starpu_task_create();
+		tasks[i]->cl = &cl;
+		tasks[i]->handles[0] = rw_handle;
+		tasks[i]->destroy = 0;
+		ret = starpu_task_submit(tasks[i]);
+		if (ret == -ENODEV)
+			goto cleanup;
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+	starpu_task_wait_for_all();
+
+	/* All tasks should have been executed on the same GPU. */
+	ret = 0;
+	workerid = tasks[0]->profiling_info->workerid;
+	for (i = 1; i < NTASKS; i++)
+	{
+		if (tasks[i]->profiling_info->workerid != workerid)
+		{
+			ret = 1;
+			break;
+		}
+	}
+
+cleanup:
+	/* On the -ENODEV paths some tasks may still be in flight; wait for
+	 * them before destroying anything (harmless when already done). */
+	starpu_task_wait_for_all();
+	for (i = 0; i < NTASKS; i++)
+		if (tasks[i])
+			starpu_task_destroy(tasks[i]);
+
+	free_data();
+	starpu_shutdown();
+
+	/* 0 or 1 from the check above, or -ENODEV from a failed submission. */
+	return ret;
+}
+
+/* XXX: Does this test apply to other schedulers ? */
+//extern struct starpu_sched_policy _starpu_sched_ws_policy;
+//extern struct starpu_sched_policy _starpu_sched_prio_policy;
+//extern struct starpu_sched_policy _starpu_sched_random_policy;
+//extern struct starpu_sched_policy _starpu_sched_dm_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_policy;
+//extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy;
+//extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
+//extern struct starpu_sched_policy _starpu_sched_eager_policy;
+extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
+//extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
+extern struct starpu_sched_policy _starpu_sched_heft_policy;
+
+static struct starpu_sched_policy *policies[] =
+{
+	//&_starpu_sched_ws_policy,
+	//&_starpu_sched_prio_policy,
+	//&_starpu_sched_dm_policy,
+	&_starpu_sched_dmda_policy,
+	&_starpu_sched_heft_policy,
+	//&_starpu_sched_dmda_ready_policy,
+	//&_starpu_sched_dmda_sorted_policy,
+	//&_starpu_sched_random_policy,
+	//&_starpu_sched_eager_policy,
+	&_starpu_sched_parallel_heft_policy,
+	//&_starpu_sched_pgreedy_policy
+};
+
+int
+main(void)
+{
+	int i;
+	int n_policies = sizeof(policies)/sizeof(policies[0]);
+	for (i = 0; i < n_policies; ++i)
+	{
+		struct starpu_sched_policy *policy = policies[i];
+		FPRINTF(stdout, "Running with policy %s.\n",
+			policy->policy_name);
+		int ret = run(policy);
+		if (ret == -ENODEV)
+			return STARPU_TEST_SKIPPED;
+		if (ret == 1)
+			return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}

+ 140 - 0
tests/sched_policies/execute_all_tasks.c

@@ -0,0 +1,140 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <math.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include <starpu_profiling.h>
+
+#include "../helper.h"
+
+/*
+ * All tasks submitted to StarPU should be executed once.
+ * Applies to: all schedulers.
+ */
+
+#define NTASKS           8
+#define TASK_DURATION    1e6 /* In microseconds */
+
+extern struct starpu_sched_policy _starpu_sched_ws_policy;
+extern struct starpu_sched_policy _starpu_sched_prio_policy;
+extern struct starpu_sched_policy _starpu_sched_random_policy;
+extern struct starpu_sched_policy _starpu_sched_dm_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
+extern struct starpu_sched_policy _starpu_sched_eager_policy;
+extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
+extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
+extern struct starpu_sched_policy _starpu_sched_heft_policy;
+
+static struct starpu_sched_policy *policies[] =
+{
+	&_starpu_sched_ws_policy,
+	&_starpu_sched_prio_policy,
+	&_starpu_sched_dm_policy,
+	&_starpu_sched_dmda_policy,
+	&_starpu_sched_heft_policy,
+	&_starpu_sched_dmda_ready_policy,
+	&_starpu_sched_dmda_sorted_policy,
+	&_starpu_sched_random_policy,
+	&_starpu_sched_eager_policy,
+	&_starpu_sched_parallel_heft_policy,
+	&_starpu_sched_pgreedy_policy
+};
+
+static void
+dummy(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+
+	usleep(TASK_DURATION);
+}
+
+/*
+ * Run NTASKS synchronous dummy tasks under scheduling policy P and check,
+ * from the profiling data, that each of them lasted at least half of
+ * TASK_DURATION, i.e. that it was really executed.
+ *
+ * Returns 0 on success and 1 on failure (submission error or a task that
+ * was too short).  Exits with STARPU_TEST_SKIPPED when starpu_init()
+ * reports -ENODEV.
+ */
+static int
+run(struct starpu_sched_policy *p)
+{
+	int ret;
+	int failed = 0;
+	struct starpu_conf conf;
+
+	/* Select the policy under test; with starpu_init(NULL) the policy
+	 * announced by main() would never actually be used. */
+	starpu_conf_init(&conf);
+	conf.sched_policy = p;
+	ret = starpu_init(&conf);
+	if (ret == -ENODEV)
+		exit(STARPU_TEST_SKIPPED);
+
+	starpu_profiling_status_set(1);
+
+	struct starpu_task *tasks[NTASKS] = { NULL };
+	struct starpu_codelet cl =
+	{
+		.cpu_funcs    = {dummy, NULL},
+		.cuda_funcs   = {dummy, NULL},
+		.opencl_funcs = {dummy, NULL},
+		.nbuffers     = 0
+	};
+
+	int i;
+	for (i = 0; i < NTASKS; i++)
+	{
+		struct starpu_task *task = starpu_task_create();
+		tasks[i] = task;
+		task->cl = &cl;
+		task->synchronous = 1;
+		task->destroy = 0;
+		ret = starpu_task_submit(task);
+		if (ret != 0)
+		{
+			failed = 1;
+			goto cleanup;
+		}
+	}
+
+	starpu_task_wait_for_all();
+
+	for (i = 0; i < NTASKS; i++)
+	{
+		struct starpu_task_profiling_info *pi = tasks[i]->profiling_info;
+		double task_len;
+
+		task_len = starpu_timing_timespec_delay_us(&pi->start_time, &pi->end_time);
+		if (task_len < TASK_DURATION/2)
+		{
+			FPRINTF(stderr, "Failed with task length: %fµs\n", task_len);
+			failed = 1;
+			break;
+		}
+	}
+
+cleanup:
+	/* Release every task that was created (destroy = 0 leaves that to
+	 * us) and stop StarPU on all paths, not only on success. */
+	for (i = 0; i < NTASKS; i++)
+		if (tasks[i])
+			starpu_task_destroy(tasks[i]);
+
+	starpu_shutdown();
+	return failed;
+}
+
+int
+main(void)
+{
+	int i;
+	int n_policies = sizeof(policies)/sizeof(policies[0]);
+	for (i = 0; i < n_policies; ++i)
+	{
+		struct starpu_sched_policy *policy = policies[i];
+		FPRINTF(stdout, "Running with policy %s.\n",
+			policy->policy_name);
+		int ret;
+		ret = run(policy);
+		if (ret == 1)
+			return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}

+ 242 - 0
tests/sched_policies/simple_cpu_gpu_sched.c

@@ -0,0 +1,242 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_profiling.h>
+
+#include "../helper.h"
+
+/*
+ * Schedulers that are aware of the expected task length provided by the
+ * perfmodels must make sure that :
+ * 	- cpu_task is scheduled on a CPU.
+ * 	- gpu_task is scheduled on a GPU.
+ *
+ * Applies to : heft, XXX : and to what other schedulers ?
+ */
+
+
+static void
+dummy(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+}
+
+/*
+ * Fake cost functions.
+ */
+static double
+cpu_task_cpu(struct starpu_task *task,
+	     enum starpu_perf_archtype arch,
+	     unsigned nimpl)
+{
+	(void) task;
+	(void) arch;
+	(void) nimpl;
+	return 1.0;
+}
+
+static double
+cpu_task_gpu(struct starpu_task *task,
+	     enum starpu_perf_archtype arch,
+	     unsigned nimpl)
+{
+	(void) task;
+	(void) arch;
+	(void) nimpl;
+
+	return 1000.0;
+}
+
+static double
+gpu_task_cpu(struct starpu_task *task,
+	     enum starpu_perf_archtype arch,
+	     unsigned nimpl)
+{
+	(void) task;
+	(void) arch;
+	(void) nimpl;
+
+	return 1000.0;
+}
+
+static double
+gpu_task_gpu(struct starpu_task *task,
+	     enum starpu_perf_archtype arch,
+	     unsigned nimpl)
+{
+	(void) task;
+	(void) arch;
+	(void) nimpl;
+
+	return 1.0;
+}
+
+static struct starpu_perfmodel model_cpu_task = 
+{
+	.type = STARPU_PER_ARCH
+};
+static struct starpu_perfmodel model_gpu_task = 
+{
+	.type = STARPU_PER_ARCH
+};
+
+static void
+init_perfmodels(void)
+{
+	int i;
+	for (i = STARPU_CPU_DEFAULT; i < STARPU_CUDA_DEFAULT; i++)
+	{
+		model_cpu_task.per_arch[i][0].cost_function = cpu_task_cpu;
+		model_gpu_task.per_arch[i][0].cost_function = gpu_task_cpu;
+	}
+	for (i = STARPU_CUDA_DEFAULT; i < STARPU_GORDON_DEFAULT; i++)
+	{
+		model_cpu_task.per_arch[i][0].cost_function = cpu_task_gpu;
+		model_gpu_task.per_arch[i][0].cost_function = gpu_task_gpu;
+	}
+}
+
+/*
+ * Dummy codelets.
+ */
+static struct starpu_codelet cpu_cl =
+{
+	.cpu_funcs    = { dummy, NULL },
+	.cuda_funcs   = { dummy, NULL },
+	.opencl_funcs = { dummy, NULL },
+	.nbuffers     = 0,
+	.model        = &model_cpu_task
+};
+
+static struct starpu_codelet gpu_cl =
+{
+	.cpu_funcs    = { dummy, NULL },
+	.cuda_funcs   = { dummy, NULL },
+	.opencl_funcs = { dummy, NULL },
+	.nbuffers     = 0,
+	.model        = &model_gpu_task
+};
+
+/*
+ * Initialize StarPU with scheduling policy POLICY, submit one task whose
+ * perfmodel favours CPUs (cpu_cl) and one whose perfmodel favours GPUs
+ * (gpu_cl), then read from the profiling data which type of worker ran each.
+ *
+ * Returns 0 when cpu_task ran on a CPU worker and gpu_task on a CUDA or
+ * OpenCL worker, 1 otherwise.  Exits with STARPU_TEST_SKIPPED when
+ * starpu_init() reports -ENODEV or when no CPU or no GPU worker is available.
+ */
+static int
+run(struct starpu_sched_policy *policy)
+{
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.sched_policy = policy;
+	int ret = starpu_init(&conf);
+	if (ret == -ENODEV)
+		exit(STARPU_TEST_SKIPPED);
+
+	/* At least 1 CPU and 1 GPU are needed. */
+	if (starpu_cpu_worker_get_count() == 0 ||
+	    (starpu_cuda_worker_get_count() == 0 &&
+	     starpu_opencl_worker_get_count() == 0))
+	{
+		/* StarPU was started successfully: stop it before skipping. */
+		starpu_shutdown();
+		exit(STARPU_TEST_SKIPPED);
+	}
+
+	starpu_profiling_status_set(1);
+	init_perfmodels();
+
+	/* destroy = 0: the tasks must outlive their execution so that their
+	 * profiling_info can be read below. */
+	struct starpu_task *cpu_task = starpu_task_create();
+	cpu_task->cl = &cpu_cl;
+	cpu_task->destroy = 0;
+
+	struct starpu_task *gpu_task = starpu_task_create();
+	gpu_task->cl = &gpu_cl;
+	gpu_task->destroy = 0;
+
+	ret = starpu_task_submit(cpu_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_task_submit(gpu_task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	starpu_task_wait_for_all();
+
+	enum starpu_archtype cpu_task_worker, gpu_task_worker;
+	cpu_task_worker = starpu_worker_get_type(cpu_task->profiling_info->workerid);
+	gpu_task_worker = starpu_worker_get_type(gpu_task->profiling_info->workerid);
+	if (cpu_task_worker != STARPU_CPU_WORKER ||
+	    (gpu_task_worker != STARPU_CUDA_WORKER &&
+	     gpu_task_worker != STARPU_OPENCL_WORKER))
+		ret = 1;
+	else
+		ret = 0;
+
+
+	starpu_task_destroy(cpu_task);
+	starpu_task_destroy(gpu_task);
+	starpu_shutdown();
+	return ret;
+}
+
+/*
+extern struct starpu_sched_policy _starpu_sched_ws_policy;
+extern struct starpu_sched_policy _starpu_sched_prio_policy;
+extern struct starpu_sched_policy _starpu_sched_random_policy;
+extern struct starpu_sched_policy _starpu_sched_dm_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
+extern struct starpu_sched_policy _starpu_sched_eager_policy;
+extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
+extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
+*/
+extern struct starpu_sched_policy _starpu_sched_heft_policy;
+
+/* XXX: what policies are we interested in ? */
+static struct starpu_sched_policy *policies[] =
+{
+	//&_starpu_sched_ws_policy,
+	//&_starpu_sched_prio_policy,
+	//&_starpu_sched_dm_policy,
+	//&_starpu_sched_dmda_policy,
+	&_starpu_sched_heft_policy,
+	//&_starpu_sched_dmda_ready_policy,
+	//&_starpu_sched_dmda_sorted_policy,
+	//&_starpu_sched_random_policy,
+	//&_starpu_sched_eager_policy,
+	//&_starpu_sched_parallel_heft_policy,
+	//&_starpu_sched_pgreedy_policy
+};
+
+int
+main(void)
+{
+#ifndef STARPU_HAVE_SETENV
+/* XXX: is this macro used by all the schedulers we are interested in ? */
+#warning "setenv() is not available, skipping this test"
+	return STARPU_TEST_SKIPPED;
+#else
+	setenv("STARPU_SCHED_BETA", "0", 1);
+
+	int i;
+	int n_policies = sizeof(policies)/sizeof(policies[0]);
+	for (i = 0; i < n_policies; ++i)
+	{
+		struct starpu_sched_policy *policy = policies[i];
+		FPRINTF(stdout, "Running with policy %s.\n",
+			policy->policy_name);
+		int ret;
+		ret = run(policy);
+		if (ret == 1)
+			return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+#endif
+}

+ 136 - 0
tests/sched_policies/simple_deps.c

@@ -0,0 +1,136 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012 Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <unistd.h>
+
+#include <starpu.h>
+#include <starpu_profiling.h>
+
+
+#include "../helper.h"
+
+/*
+ * Task1 must be executed before task0, even if task0 is submitted first.
+ * Applies to : all schedulers.
+ */
+
+static void
+dummy(void *buffers[], void *args)
+{
+	(void) buffers;
+	(void) args;
+	usleep(1000000);
+}
+
+static int
+run(struct starpu_sched_policy *policy)
+{
+	int ret;
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	conf.sched_policy = policy;
+	ret = starpu_init(&conf);
+	if (ret != 0)
+		exit(STARPU_TEST_SKIPPED);
+	starpu_profiling_status_set(1);
+
+	struct starpu_codelet cl =
+	{
+		.cpu_funcs = {dummy, NULL},
+		.nbuffers = 0
+	};
+
+	struct starpu_task *task0 = starpu_task_create();
+	task0->cl = &cl;
+	task0->destroy = 0;
+
+	struct starpu_task *task1 = starpu_task_create();
+	task1->cl = &cl;
+	task1->destroy = 0;
+
+	starpu_task_declare_deps_array(task0, 1, &task1);
+
+	ret = starpu_task_submit(task0);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	ret = starpu_task_submit(task1);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+	starpu_task_wait_for_all();
+
+	double t1, t2;
+	t1 = starpu_timing_timespec_to_us(&task1->profiling_info->end_time);
+	t2 = starpu_timing_timespec_to_us(&task0->profiling_info->start_time);
+
+	starpu_task_destroy(task0);
+	starpu_task_destroy(task1);
+	starpu_shutdown();
+
+	return t1 < t2 ? 0:1;
+
+enodev:
+	starpu_shutdown();
+	return -ENODEV;
+}
+
+extern struct starpu_sched_policy _starpu_sched_ws_policy;
+extern struct starpu_sched_policy _starpu_sched_prio_policy;
+extern struct starpu_sched_policy _starpu_sched_random_policy;
+extern struct starpu_sched_policy _starpu_sched_dm_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy;
+extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
+extern struct starpu_sched_policy _starpu_sched_eager_policy;
+extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
+extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
+extern struct starpu_sched_policy _starpu_sched_heft_policy;
+
+static struct starpu_sched_policy *policies[] =
+{
+	&_starpu_sched_ws_policy,
+	&_starpu_sched_prio_policy,
+	&_starpu_sched_dm_policy,
+	&_starpu_sched_dmda_policy,
+	&_starpu_sched_heft_policy,
+	&_starpu_sched_dmda_ready_policy,
+	&_starpu_sched_dmda_sorted_policy,
+	&_starpu_sched_random_policy,
+	&_starpu_sched_eager_policy,
+	&_starpu_sched_parallel_heft_policy,
+	&_starpu_sched_pgreedy_policy
+};
+
+int
+main(void)
+{
+	int i;
+	int n_policies = sizeof(policies)/sizeof(policies[0]);
+	for (i = 0; i < n_policies; ++i)
+	{
+		struct starpu_sched_policy *policy = policies[i];
+		FPRINTF(stdout, "Running with policy %s.\n",
+			policy->policy_name);
+		int ret;
+		ret = run(policy);
+		if (ret == -ENODEV)
+			return STARPU_TEST_SKIPPED;
+		if (ret == 1)
+			return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}

+ 1 - 0
tools/dev/internal/rename_internal.sed

@@ -14,6 +14,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
+s/\bheft_policy\b/_starpu_sched_heft_policy/g
 s/\bstruct starpu_priority_taskq_s\b/struct _starpu_priority_taskq/g
 s/\bSTARPU_FUT_APPS_KEY\b/_STARPU_FUT_APPS_KEY/g
 s/\bSTARPU_FUT_CPU_KEY\b/_STARPU_FUT_CPU_KEY/g