12 years ago · 22b0946858
--- a/build-aux/compile
+++ b/build-aux/compile
@@ -1,342 +0,0 @@
 
				-#! /bin/sh
			
 
				-# Wrapper for compilers which do not understand '-c -o'.
			
 
				-
			
 
				-scriptversion=2012-03-05.13; # UTC
			
 
				-
			
 
				-# Copyright (C) 1999-2012 Free Software Foundation, Inc.
			
 
				-# Written by Tom Tromey <tromey@cygnus.com>.
			
 
				-#
			
 
				-# This program is free software; you can redistribute it and/or modify
			
 
				-# it under the terms of the GNU General Public License as published by
			
 
				-# the Free Software Foundation; either version 2, or (at your option)
			
 
				-# any later version.
			
 
				-#
			
 
				-# This program is distributed in the hope that it will be useful,
			
 
				-# but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				-# GNU General Public License for more details.
			
 
				-#
			
 
				-# You should have received a copy of the GNU General Public License
			
 
				-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
			
 
				-
			
 
				-# As a special exception to the GNU General Public License, if you
			
 
				-# distribute this file as part of a program that contains a
			
 
				-# configuration script generated by Autoconf, you may include it under
			
 
				-# the same distribution terms that you use for the rest of that program.
			
 
				-
			
 
				-# This file is maintained in Automake, please report
			
 
				-# bugs to <bug-automake@gnu.org> or send patches to
			
 
				-# <automake-patches@gnu.org>.
			
 
				-
			
 
				-nl='
			
 
				-'
			
 
				-
			
 
				-# We need space, tab and new line, in precisely that order.  Quoting is
			
 
				-# there to prevent tools from complaining about whitespace usage.
			
 
				-IFS=" ""	$nl"
			
 
				-
			
 
				-file_conv=
			
 
				-
			
 
				-# func_file_conv build_file lazy
			
 
				-# Convert a $build file to $host form and store it in $file
			
 
				-# Currently only supports Windows hosts. If the determined conversion
			
 
				-# type is listed in (the comma separated) LAZY, no conversion will
			
 
				-# take place.
			
 
				-func_file_conv ()
			
 
				-{
			
 
				-  file=$1
			
 
				-  case $file in
			
 
				-    / | /[!/]*) # absolute file, and not a UNC file
			
 
				-      if test -z "$file_conv"; then
			
 
				-	# lazily determine how to convert abs files
			
 
				-	case `uname -s` in
			
 
				-	  MINGW*)
			
 
				-	    file_conv=mingw
			
 
				-	    ;;
			
 
				-	  CYGWIN*)
			
 
				-	    file_conv=cygwin
			
 
				-	    ;;
			
 
				-	  *)
			
 
				-	    file_conv=wine
			
 
				-	    ;;
			
 
				-	esac
			
 
				-      fi
			
 
				-      case $file_conv/,$2, in
			
 
				-	*,$file_conv,*)
			
 
				-	  ;;
			
 
				-	mingw/*)
			
 
				-	  file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`
			
 
				-	  ;;
			
 
				-	cygwin/*)
			
 
				-	  file=`cygpath -m "$file" || echo "$file"`
			
 
				-	  ;;
			
 
				-	wine/*)
			
 
				-	  file=`winepath -w "$file" || echo "$file"`
			
 
				-	  ;;
			
 
				-      esac
			
 
				-      ;;
			
 
				-  esac
			
 
				-}
			
 
				-
			
 
				-# func_cl_dashL linkdir
			
 
				-# Make cl look for libraries in LINKDIR
			
 
				-func_cl_dashL ()
			
 
				-{
			
 
				-  func_file_conv "$1"
			
 
				-  if test -z "$lib_path"; then
			
 
				-    lib_path=$file
			
 
				-  else
			
 
				-    lib_path="$lib_path;$file"
			
 
				-  fi
			
 
				-  linker_opts="$linker_opts -LIBPATH:$file"
			
 
				-}
			
 
				-
			
 
				-# func_cl_dashl library
			
 
				-# Do a library search-path lookup for cl
			
 
				-func_cl_dashl ()
			
 
				-{
			
 
				-  lib=$1
			
 
				-  found=no
			
 
				-  save_IFS=$IFS
			
 
				-  IFS=';'
			
 
				-  for dir in $lib_path $LIB
			
 
				-  do
			
 
				-    IFS=$save_IFS
			
 
				-    if $shared && test -f "$dir/$lib.dll.lib"; then
			
 
				-      found=yes
			
 
				-      lib=$dir/$lib.dll.lib
			
 
				-      break
			
 
				-    fi
			
 
				-    if test -f "$dir/$lib.lib"; then
			
 
				-      found=yes
			
 
				-      lib=$dir/$lib.lib
			
 
				-      break
			
 
				-    fi
			
 
				-  done
			
 
				-  IFS=$save_IFS
			
 
				-
			
 
				-  if test "$found" != yes; then
			
 
				-    lib=$lib.lib
			
 
				-  fi
			
 
				-}
			
 
				-
			
 
				-# func_cl_wrapper cl arg...
			
 
				-# Adjust compile command to suit cl
			
 
				-func_cl_wrapper ()
			
 
				-{
			
 
				-  # Assume a capable shell
			
 
				-  lib_path=
			
 
				-  shared=:
			
 
				-  linker_opts=
			
 
				-  for arg
			
 
				-  do
			
 
				-    if test -n "$eat"; then
			
 
				-      eat=
			
 
				-    else
			
 
				-      case $1 in
			
 
				-	-o)
			
 
				-	  # configure might choose to run compile as 'compile cc -o foo foo.c'.
			
 
				-	  eat=1
			
 
				-	  case $2 in
			
 
				-	    *.o | *.[oO][bB][jJ])
			
 
				-	      func_file_conv "$2"
			
 
				-	      set x "$@" -Fo"$file"
			
 
				-	      shift
			
 
				-	      ;;
			
 
				-	    *)
			
 
				-	      func_file_conv "$2"
			
 
				-	      set x "$@" -Fe"$file"
			
 
				-	      shift
			
 
				-	      ;;
			
 
				-	  esac
			
 
				-	  ;;
			
 
				-	-I)
			
 
				-	  eat=1
			
 
				-	  func_file_conv "$2" mingw
			
 
				-	  set x "$@" -I"$file"
			
 
				-	  shift
			
 
				-	  ;;
			
 
				-	-I*)
			
 
				-	  func_file_conv "${1#-I}" mingw
			
 
				-	  set x "$@" -I"$file"
			
 
				-	  shift
			
 
				-	  ;;
			
 
				-	-l)
			
 
				-	  eat=1
			
 
				-	  func_cl_dashl "$2"
			
 
				-	  set x "$@" "$lib"
			
 
				-	  shift
			
 
				-	  ;;
			
 
				-	-l*)
			
 
				-	  func_cl_dashl "${1#-l}"
			
 
				-	  set x "$@" "$lib"
			
 
				-	  shift
			
 
				-	  ;;
			
 
				-	-L)
			
 
				-	  eat=1
			
 
				-	  func_cl_dashL "$2"
			
 
				-	  ;;
			
 
				-	-L*)
			
 
				-	  func_cl_dashL "${1#-L}"
			
 
				-	  ;;
			
 
				-	-static)
			
 
				-	  shared=false
			
 
				-	  ;;
			
 
				-	-Wl,*)
			
 
				-	  arg=${1#-Wl,}
			
 
				-	  save_ifs="$IFS"; IFS=','
			
 
				-	  for flag in $arg; do
			
 
				-	    IFS="$save_ifs"
			
 
				-	    linker_opts="$linker_opts $flag"
			
 
				-	  done
			
 
				-	  IFS="$save_ifs"
			
 
				-	  ;;
			
 
				-	-Xlinker)
			
 
				-	  eat=1
			
 
				-	  linker_opts="$linker_opts $2"
			
 
				-	  ;;
			
 
				-	-*)
			
 
				-	  set x "$@" "$1"
			
 
				-	  shift
			
 
				-	  ;;
			
 
				-	*.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
			
 
				-	  func_file_conv "$1"
			
 
				-	  set x "$@" -Tp"$file"
			
 
				-	  shift
			
 
				-	  ;;
			
 
				-	*.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
			
 
				-	  func_file_conv "$1" mingw
			
 
				-	  set x "$@" "$file"
			
 
				-	  shift
			
 
				-	  ;;
			
 
				-	*)
			
 
				-	  set x "$@" "$1"
			
 
				-	  shift
			
 
				-	  ;;
			
 
				-      esac
			
 
				-    fi
			
 
				-    shift
			
 
				-  done
			
 
				-  if test -n "$linker_opts"; then
			
 
				-    linker_opts="-link$linker_opts"
			
 
				-  fi
			
 
				-  exec "$@" $linker_opts
			
 
				-  exit 1
			
 
				-}
			
 
				-
			
 
				-eat=
			
 
				-
			
 
				-case $1 in
			
 
				-  '')
			
 
				-     echo "$0: No command.  Try '$0 --help' for more information." 1>&2
			
 
				-     exit 1;
			
 
				-     ;;
			
 
				-  -h | --h*)
			
 
				-    cat <<\EOF
			
 
				-Usage: compile [--help] [--version] PROGRAM [ARGS]
			
 
				-
			
 
				-Wrapper for compilers which do not understand '-c -o'.
			
 
				-Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
			
 
				-arguments, and rename the output as expected.
			
 
				-
			
 
				-If you are trying to build a whole package this is not the
			
 
				-right script to run: please start by reading the file 'INSTALL'.
			
 
				-
			
 
				-Report bugs to <bug-automake@gnu.org>.
			
 
				-EOF
			
 
				-    exit $?
			
 
				-    ;;
			
 
				-  -v | --v*)
			
 
				-    echo "compile $scriptversion"
			
 
				-    exit $?
			
 
				-    ;;
			
 
				-  cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )
			
 
				-    func_cl_wrapper "$@"      # Doesn't return...
			
 
				-    ;;
			
 
				-esac
			
 
				-
			
 
				-ofile=
			
 
				-cfile=
			
 
				-
			
 
				-for arg
			
 
				-do
			
 
				-  if test -n "$eat"; then
			
 
				-    eat=
			
 
				-  else
			
 
				-    case $1 in
			
 
				-      -o)
			
 
				-	# configure might choose to run compile as 'compile cc -o foo foo.c'.
			
 
				-	# So we strip '-o arg' only if arg is an object.
			
 
				-	eat=1
			
 
				-	case $2 in
			
 
				-	  *.o | *.obj)
			
 
				-	    ofile=$2
			
 
				-	    ;;
			
 
				-	  *)
			
 
				-	    set x "$@" -o "$2"
			
 
				-	    shift
			
 
				-	    ;;
			
 
				-	esac
			
 
				-	;;
			
 
				-      *.c)
			
 
				-	cfile=$1
			
 
				-	set x "$@" "$1"
			
 
				-	shift
			
 
				-	;;
			
 
				-      *)
			
 
				-	set x "$@" "$1"
			
 
				-	shift
			
 
				-	;;
			
 
				-    esac
			
 
				-  fi
			
 
				-  shift
			
 
				-done
			
 
				-
			
 
				-if test -z "$ofile" || test -z "$cfile"; then
			
 
				-  # If no '-o' option was seen then we might have been invoked from a
			
 
				-  # pattern rule where we don't need one.  That is ok -- this is a
			
 
				-  # normal compilation that the losing compiler can handle.  If no
			
 
				-  # '.c' file was seen then we are probably linking.  That is also
			
 
				-  # ok.
			
 
				-  exec "$@"
			
 
				-fi
			
 
				-
			
 
				-# Name of file we expect compiler to create.
			
 
				-cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
			
 
				-
			
 
				-# Create the lock directory.
			
 
				-# Note: use '[/\\:.-]' here to ensure that we don't use the same name
			
 
				-# that we are using for the .o file.  Also, base the name on the expected
			
 
				-# object file name, since that is what matters with a parallel build.
			
 
				-lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
			
 
				-while true; do
			
 
				-  if mkdir "$lockdir" >/dev/null 2>&1; then
			
 
				-    break
			
 
				-  fi
			
 
				-  sleep 1
			
 
				-done
			
 
				-# FIXME: race condition here if user kills between mkdir and trap.
			
 
				-trap "rmdir '$lockdir'; exit 1" 1 2 15
			
 
				-
			
 
				-# Run the compile.
			
 
				-"$@"
			
 
				-ret=$?
			
 
				-
			
 
				-if test -f "$cofile"; then
			
 
				-  test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
			
 
				-elif test -f "${cofile}bj"; then
			
 
				-  test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
			
 
				-fi
			
 
				-
			
 
				-rmdir "$lockdir"
			
 
				-exit $ret
			
 
				-
			
 
				-# Local Variables:
			
 
				-# mode: shell-script
			
 
				-# sh-indentation: 2
			
 
				-# eval: (add-hook 'write-file-hooks 'time-stamp)
			
 
				-# time-stamp-start: "scriptversion="
			
 
				-# time-stamp-format: "%:y-%02m-%02d.%02H"
			
 
				-# time-stamp-time-zone: "UTC"
			
 
				-# time-stamp-end: "; # UTC"
			
 
				-# End:
			
--- a/configure.ac
+++ b/configure.ac
@@ -171,6 +171,9 @@ fi
 
				 # Some systems do not define strerror_r
			
 
				 AC_CHECK_FUNC([strerror_r], [AC_DEFINE([STARPU_HAVE_STRERROR_R], [1], [Define to 1 if the function strerro_r is available.])])
			
 
				 
			
 
				+# Some systems may not define setenv
			
 
				+AC_CHECK_FUNC([setenv], [AC_DEFINE([STARPU_HAVE_SETENV], [1], [Define to 1 if the function setenv is available.])])
			
 
				+
			
 
				 # Some systems do not define unsetenv
			
 
				 AC_CHECK_FUNC([unsetenv], [AC_DEFINE([STARPU_HAVE_UNSETENV], [1], [Define to 1 if the function unsetenv is available.])])
			
 
				 
			
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -60,7 +60,7 @@ uninstall-local:
 
				 #	vector_scal_c.texi vector_scal_cuda.texi vector_scal_opencl.texi vector_scal_opencl_codelet.texi
			
 
				 
			
 
				 # Rule to update documentation on web server. Should only be used locally.
			
 
				-PUBLISHHOST	= sync
			
 
				+PUBLISHHOST	?= sync
			
 
				 update-web: starpu.html
			
 
				 	sed -i 's/gcc\.html#Attribute-Syntax/http:\/\/gcc.gnu.org\/onlinedocs\/gcc\/Attribute-Syntax.html#Attribute-Syntax/' starpu.html
			
 
				 	scp starpu.pdf starpu.html $(PUBLISHHOST):/web/runtime/html/StarPU
			
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -11,9 +11,10 @@
 
				 * Multiformat Data Interface::  
			
 
				 * Task Bundles::                
			
 
				 * Task Lists::                  
			
 
				-* Using Parallel Tasks::       
			
 
				-* Scheduling Contexts::
			
 
				+* Using Parallel Tasks::        
			
 
				+* Scheduling Contexts::         
			
 
				 * Defining a new scheduling policy::  
			
 
				+* Running drivers::             
			
 
				 * Expert mode::                 
			
 
				 @end menu
			
 
				 
			
@@ -792,6 +793,56 @@ static struct starpu_sched_policy dummy_sched_policy = @{
 
				 @end smallexample
			
 
				 @end cartouche
			
 
				 
			
 
				+@node Running drivers
			
 
				+@section Running drivers
			
 
				+
			
 
				+@menu
			
 
				+* Driver API::
			
 
				+* Running drivers Example::
			
 
				+@end menu
			
 
				+
			
 
				+@node Driver API
			
 
				+@subsection Driver API
			
 
				+
			
 
				+@deftypefun int starpu_driver_init (struct starpu_driver *@var{d})
			
 
				+Initialize the given driver. Returns 0 on success, -EINVAL if d->type is not
			
 
				+STARPU_CUDA_WORKER.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@deftypefun int starpu_driver_run_once (struct starpu_driver *@var{d})
			
 
				+Runs the driver for a while, then returns 0 on success, -EINVAL if d->type is
			
 
				+not STARPU_CUDA_WORKER.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@deftypefun int starpu_driver_deinit (struct starpu_driver *@var{d})
			
 
				+Deinitialize the given driver. Returns 0 on success, -EINVAL if d->type is not
			
 
				+STARPU_CUDA_WORKER.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@node Running drivers Example
			
 
				+@subsection Example
			
 
				+
			
 
				+@cartouche
			
 
				+@smallexample
			
 
				+int ret;
			
 
				+struct starpu_driver d = @{
			
 
				+    .type = STARPU_CUDA_WORKER,
			
 
				+    .id.cuda_id = 0
			
 
				+@};
			
 
				+ret = starpu_driver_init(&d);
			
 
				+if (ret != 0)
			
 
				+    error();
			
 
				+while (some_condition) @{
			
 
				+    ret = starpu_driver_run_once(&d);
			
 
				+    if (ret != 0)
			
 
				+        error();
			
 
				+@}
			
 
				+ret = starpu_driver_deinit(&d);
			
 
				+if (ret != 0)
			
 
				+    error();
			
 
				+@end smallexample
			
 
				+@end cartouche
			
 
				+
			
 
				 @node Expert mode
			
 
				 @section Expert mode
			
 
				 
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -196,7 +196,6 @@ examplebin_PROGRAMS +=				\
 
				 	scheduler/dummy_sched			\
			
 
				 	reductions/dot_product			\
			
 
				 	reductions/minmax_reduction		\
			
 
				-	mandelbrot/mandelbrot			\
			
 
				 	ppm_downscaler/ppm_downscaler		\
			
 
				 	ppm_downscaler/yuv_downscaler
			
 
				 
			
@@ -780,12 +779,6 @@ endif
 
				 # Mandelbrot Set #
			
 
				 ##################
			
 
				 
			
 
				-mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
			
 
				-if HAVE_X11
			
 
				-mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
			
 
				-mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) $(X_EXTRA_LIBS) -lX11
			
 
				-endif
			
 
				-
			
 
				 ################
			
 
				 # Top Examples #
			
 
				 ################
			
--- a/examples/lu/xlu_pivot.c
+++ b/examples/lu/xlu_pivot.c
@@ -380,7 +380,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	double timing;
			
 
				+	double timing=0.0;
			
 
				 	int ret = dw_codelet_facto_pivot(&dataA, piv_description, nblocks, get_block_with_striding, &timing);
			
 
				 
			
 
				 	FPRINTF(stderr, "Computation took (in ms)\n");
			
@@ -435,7 +435,7 @@ int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, uns
 
				 		piv_description[block].last = (block + 1) * (size / nblocks);
			
 
				 	}
			
 
				 
			
 
				-	double timing;
			
 
				+	double timing=0.0;
			
 
				 	int ret = dw_codelet_facto_pivot(dataAp, piv_description, nblocks, get_block_with_no_striding, &timing);
			
 
				 
			
 
				 	FPRINTF(stderr, "Computation took (in ms)\n");
			
--- a/gcc-plugin/tests/Makefile.am
+++ b/gcc-plugin/tests/Makefile.am
@@ -15,20 +15,31 @@
 
				 
			
 
				 
			
 
				 gcc_tests =					\
			
 
				+  base.c					\
			
 
				+  pointers.c					\
			
 
				+  output-pointer.c				\
			
 
				   output-pointer-errors.c			\
			
 
				+  register.c					\
			
 
				   register-errors.c				\
			
 
				+  acquire.c					\
			
 
				   acquire-errors.c				\
			
 
				+  release.c					\
			
 
				   release-errors.c				\
			
 
				+  unregister.c					\
			
 
				   unregister-errors.c				\
			
 
				   task-errors.c					\
			
 
				   scalar-tasks.c				\
			
 
				   pointer-tasks.c				\
			
 
				   external-task-impl.c				\
			
 
				   no-initialize.c				\
			
 
				+  lib-user.c					\
			
 
				   wait-errors.c					\
			
 
				+  heap-allocated.c				\
			
 
				   heap-allocated-errors.c			\
			
 
				   verbose.c					\
			
 
				   debug-tree.c					\
			
 
				+  opencl.c					\
			
 
				+  opencl-errors.c				\
			
 
				   shutdown-errors.c
			
 
				 
			
 
				 EXTRA_DIST =
			
@@ -42,35 +53,11 @@ gcc_tests += opencl-types.c
 
				 # This test simulates errors when lacking an OpenCL implementation.
			
 
				 gcc_tests += opencl-lacking.c
			
 
				 
			
 
				-gcc_tests +=  					\
			
 
				-  base.c 					\
			
 
				-  pointers.c 					\
			
 
				-  output-pointer.c				\
			
 
				-  register.c					\
			
 
				-  acquire.c					\
			
 
				-  release.c					\
			
 
				-  unregister.c					\
			
 
				-  lib-user.c					\
			
 
				-  heap-allocated.c				\
			
 
				-  opencl.c					\
			
 
				-  opencl-errors.c
			
 
				-
			
 
				 else STARPU_USE_OPENCL
			
 
				 
			
 
				 EXTRA_DIST +=					\
			
 
				-  base.c					\
			
 
				-  pointers.c					\
			
 
				   opencl-types.c				\
			
 
				-  opencl-lacking.c				\
			
 
				-  output-pointer.c				\
			
 
				-  register.c					\
			
 
				-  acquire.c					\
			
 
				-  release.c					\
			
 
				-  unregister.c					\
			
 
				-  lib-user.c					\
			
 
				-  heap-allocated.c				\
			
 
				-  opencl.c					\
			
 
				-  opencl-errors.c
			
 
				+  opencl-lacking.c
			
 
				 
			
 
				 endif STARPU_USE_OPENCL
			
 
				 
			
--- a/gcc-plugin/tests/mocks.h
+++ b/gcc-plugin/tests/mocks.h
@@ -424,13 +424,19 @@ starpu_free (void *ptr)
 
				 
			
 
				 /* OpenCL support.  */
			
 
				 
			
 
				-#define STARPU_USE_OPENCL 1
			
 
				+#ifndef STARPU_USE_OPENCL
			
 
				 
			
 
				+# define STARPU_USE_OPENCL 1
			
 
				+
			
 
				+/* The `opencl' pragma needs this structure, so make sure it's defined.  */
			
 
				 struct starpu_opencl_program
			
 
				 {
			
 
				   /* Nothing.  */
			
 
				 };
			
 
				 
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				 /* Number of `load_opencl_from_string' calls.  */
			
 
				 static unsigned int load_opencl_calls;
			
 
				 
			
--- a/gcc-plugin/tests/run-test.in
+++ b/gcc-plugin/tests/run-test.in
@@ -73,6 +73,9 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
 
				 (define %cuda-cppflags
			
 
				   (string-tokenize "@STARPU_CUDA_CPPFLAGS@"))
			
 
				 
			
 
				+(define %opencl-cppflags
			
 
				+  (string-tokenize "@STARPU_OPENCL_CPPFLAGS@"))
			
 
				+
			
 
				 (define %default-cflags
			
 
				   `("-I" ,%srcdir
			
 
				     "-I" ,(string-append %srcdir "/../../src")    ; for <common/uthash.h>
			
@@ -81,6 +84,7 @@ exec "${GUILE-@GUILE@}" -l "$0"    \
 
				     "-I" ,(string-append %builddir "/../..")
			
 
				 
			
 
				     ,@%cuda-cppflags
			
 
				+    ,@%opencl-cppflags
			
 
				 
			
 
				     ;; Unfortunately `libtool --mode=execute' doesn't help here, so hard-code
			
 
				     ;; the real file name.
			
--- a/socl/examples/Makefile.am
+++ b/socl/examples/Makefile.am
@@ -43,18 +43,21 @@ examplebin_PROGRAMS =
 
				 
			
 
				 examplebin_PROGRAMS +=		\
			
 
				 	basic/basic		\
			
 
				-	clinfo/clinfo
			
 
				+	clinfo/clinfo \
			
 
				+  matmul/matmul
			
 
				 
			
 
				 #	mandelbrot/mandelbrot
			
 
				 
			
 
				 SOCL_EXAMPLES +=		\
			
 
				 	basic/basic		\
			
 
				-	clinfo/clinfo
			
 
				+	clinfo/clinfo\
			
 
				+  matmul/matmul
			
 
				 
			
 
				 #	mandelbrot/mandelbrot
			
 
				 
			
 
				 basic_basic_SOURCES = basic/basic.c
			
 
				 clinfo_clinfo_SOURCES = clinfo/clinfo.c
			
 
				+matmul_matmul_SOURCES = matmul/matmul.c
			
 
				 #mandelbrot_mandelbrot_SOURCES = mandelbrot/mandelbrot.c
			
 
				 
			
 
				 #mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)
			
--- a/socl/examples/matmul/matmul.c
+++ b/socl/examples/matmul/matmul.c
@@ -0,0 +1,477 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.

			
 
				+ *

			
 
				+ * Copyright (C) 2010,2011 University of Bordeaux

			
 
				+ *

			
 
				+ * StarPU is free software; you can redistribute it and/or modify

			
 
				+ * it under the terms of the GNU Lesser General Public License as published by

			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at

			
 
				+ * your option) any later version.

			
 
				+ *

			
 
				+ * StarPU is distributed in the hope that it will be useful, but

			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of

			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

			
 
				+ *

			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.

			
 
				+ */

			
 
				+

			
 
				+#include <CL/cl.h>

			
 
				+#include <stdio.h>

			
 
				+#include <string.h>

			
 
				+#include <stdlib.h>

			
 
				+#include <stdint.h>

			
 
				+#include <unistd.h>

			
 
				+#include <assert.h>

			
 
				+#include <math.h>

			
 
				+#include <sys/time.h>

			
 
				+

			
 
				+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)

			
 
				+#define check(exp) do { cl_int err = exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }} while(0)

			
 
				+#define check2(exp) exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }

			
 
				+

			
 
				+// Thread block size

			
 
				+#define BLOCK_SIZE 16  // Kernel thread-block size

			
 
				+#define WORK_SIZE 64  // Kernel global size in lines of A (or C)

			
 
				+#define TYPE float

			
 
				+

			
 
				+// Basic Matrix dimensions

			
 
				+#define WA (1024L * BLOCK_SIZE) // Matrix A width

			
 
				+#define HA (512L * BLOCK_SIZE) // Matrix A height

			
 
				+#define WB (1024L * BLOCK_SIZE) // Matrix B width

			
 
				+#define HB WA  // Matrix B height

			
 
				+#define WC WB  // Matrix C width 

			
 
				+#define HC HA  // Matrix C height

			
 
				+#define BLOCKS (HA / WORK_SIZE)

			
 
				+

			
 
				+////////////////////////////////////////////////////////////////////////////////

			
 
				+// declaration, forward

			
 
				+void printDiff(TYPE*, TYPE*, int, int, int, TYPE);

			
 
				+void computeReference(TYPE*, const TYPE*, const TYPE*, unsigned int, unsigned int, unsigned int);

			
 
				+

			
 
				+#define str(x) #x

			
 
				+

			
 
				+#define CODE "\

			
 
				+#define TYPE float\n\

			
 
				+__kernel void sgemmNN(int wa, int ha, int wb,  __global TYPE* A, __global TYPE* B, __global TYPE* C) {\n\

			
 
				+#define BS 16\n\

			
 
				+#define BLOCK_SIZE 16\n\

			
 
				+  int bx = get_group_id(0);\n\

			
 
				+  int by = get_group_id(1);\n\

			
 
				+  \n\

			
 
				+  int tx = get_local_id(0);\n\

			
 
				+  int ty = get_local_id(1);\n\

			
 
				+  \n\

			
 
				+  int gx = get_global_id(0);\n\

			
 
				+  int gy = get_global_id(1);\n\

			
 
				+    __local float As[BS][BS+1];\

			
 
				+    __local float Bs[BS][BS+1];\

			
 
				+  \n\

			
 
				+  unsigned int block_w = min(wb - bx * BLOCK_SIZE, BLOCK_SIZE);\n\

			
 
				+  unsigned int block_h = min(ha - by * BLOCK_SIZE, BLOCK_SIZE);\n\

			
 
				+  \n\

			
 
				+  int valid = (gx < wb && gy < ha);\n\

			
 
				+  \n\

			
 
				+  TYPE Csub = (TYPE)0.0;\n\

			
 
				+  \n\

			
 
				+  int pos = 0;\n\

			
 
				+  while (pos < wa) {\n\

			
 
				+    unsigned int size = min(wa-pos, BLOCK_SIZE);\n\

			
 
				+    if (tx < size && gy < ha)\n\

			
 
				+      As[tx][ty] = A[pos + tx + wa * gy];\n\

			
 
				+    if (ty < size && gx < wb)\n\

			
 
				+      Bs[tx][ty] = B[gx + wb * (pos+ty)];\n\

			
 
				+    \n\

			
 
				+    barrier(CLK_LOCAL_MEM_FENCE);\n\

			
 
				+    \n\

			
 
				+    if (valid) {\n\

			
 
				+      for (int k = 0; k < size; ++k)\n\

			
 
				+        Csub += As[k][ty] * Bs[tx][k];\n\

			
 
				+    }\n\

			
 
				+    pos += size;\n\

			
 
				+    barrier(CLK_LOCAL_MEM_FENCE);\n\

			
 
				+  }\n\

			
 
				+  \n\

			
 
				+  if (valid)\n\

			
 
				+    C[wb * gy + gx] = Csub;\n\

			
 
				+}"

			
 
				+

			
 
				+static char * code =  CODE;

			
 
				+

			
 
				+int check = 0;

			
 
				+

			
 
				+static void __attribute__((unused)) parse_args(int argc, char **argv)

			
 
				+{

			
 
				+	int i;

			
 
				+	for (i = 1; i < argc; i++)

			
 
				+	{

			
 
				+		if (strcmp(argv[i], "-check") == 0)

			
 
				+		{

			
 
				+			check = 1;

			
 
				+		}

			
 
				+

			
 
				+		if (strcmp(argv[i], "-h") == 0)

			
 
				+		{

			
 
				+			printf("usage : %s [-check]\n", argv[0]);

			
 
				+		}

			
 
				+	}

			
 
				+}

			
 
				+

			
 
				+#define shrLog(...) fprintf(stderr, __VA_ARGS__);

			
 
				+

			
 
				// Round global_size up to the nearest multiple of group_size.
				//
				// Returns global_size unchanged when it is already a multiple, or when
				// group_size is 0 (no multiple exists, and `%` by zero is undefined).
				// The rounded value is formed in long long so the addition cannot
				// overflow int when global_size is close to INT_MAX.
				size_t shrRoundUp(int group_size, int global_size) {
					if (group_size == 0) {
						return global_size;
					}

					int r = global_size % group_size;
					if (r == 0) {
						return global_size;
					}

					return (size_t)((long long)global_size + group_size - r);
				}

			
 
				+

			
 
				+void fillArray(TYPE* pfData, int iSize) {

			
 
				+	int i;

			
 
				+	const TYPE fScale = (TYPE)(1.0f / (float)RAND_MAX);

			
 
				+	for (i = 0; i < iSize; ++i) {

			
 
				+		pfData[i] = fScale * rand();

			
 
				+	}

			
 
				+}

			
 
				+

			
 
				// Log the first iSize elements of pfData through shrLog, one
				// "index: value" line per element, values shown with three decimals.
				void shrPrintArray(float* pfData, int iSize) {
					int idx = 0;

					while (idx < iSize) {
						shrLog("%d: %.3f\n", idx, pfData[idx]);
						++idx;
					}
				}

			
 
				+

			
 
				+/**

			
 
				+ * Compare two float arrays using L2-norm with an epsilon tolerance for equality

			
 
				+ * @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE

			
 
				+ * @param reference  handle to the reference data / gold image

			
 
				+ * @param data       handle to the computed data

			
 
				+ * @param len        number of elements in reference and data

			
 
				+ * @param epsilon    epsilon to use for the comparison

			
 
				+*/

			
 
				+int shrCompareL2fe( const float* reference, const float* data, const unsigned int len, const float epsilon ) {

			
 
				+	assert(epsilon >= 0);

			
 
				+

			
 
				+	float error = 0;

			
 
				+	float ref = 0;

			
 
				+

			
 
				+	unsigned int i;

			
 
				+	for(i = 0; i < len; ++i) {

			
 
				+		float diff = reference[i] - data[i];

			
 
				+		error += diff * diff;

			
 
				+		ref += reference[i] * reference[i];

			
 
				+	}

			
 
				+

			
 
				+	float normRef = sqrtf(ref);

			
 
				+	if (fabs(ref) < 1e-7) {

			
 
				+#ifdef _DEBUG

			
 
				+		fprintf(stderr, "ERROR, reference l2-norm is 0\n");

			
 
				+#endif

			
 
				+		return 0;

			
 
				+	}

			
 
				+	float normError = sqrtf(error);

			
 
				+	error = normError / normRef;

			
 
				+	int result = error < epsilon;

			
 
				+#ifdef _DEBUG

			
 
				+	if( !result) {

			
 
				+		fprintf(stderr, "ERROR, l2-norm error %d is greater than epsilon %lf \n", error, epsilon);

			
 
				+	}

			
 
				+#endif

			
 
				+

			
 
				+	return result;

			
 
				+}

			
 
				+

			
 
				+

			
 
				+int main(int argc, const char** argv) {

			
 
				+	cl_uint platform_count;

			
 
				+	cl_platform_id platforms[5];

			
 
				+

			
 
				+	cl_int err = CL_SUCCESS;

			
 
				+	unsigned int i, p;

			
 
				+

			
 
				+	cl_device_type dev_type = CL_DEVICE_TYPE_ALL;

			
 
				+

			
 
				+	void * ptrs[BLOCKS];

			
 
				+	cl_mem d_A[BLOCKS];

			
 
				+	cl_mem d_C[BLOCKS];

			
 
				+	cl_mem d_B[BLOCKS];

			
 
				+

			
 
				+	cl_event GPUDone[BLOCKS];

			
 
				+	cl_event GPUExecution[BLOCKS];

			
 
				+	struct timeval start, end;

			
 
				+

			
 
				+	int workOffset[BLOCKS];

			
 
				+	int workSize[BLOCKS];

			
 
				+

			
 
				+	unsigned int sizePerGPU = HC / BLOCKS;

			
 
				+	unsigned int sizeMod = HC % BLOCKS;

			
 
				+

			
 
				+	size_t A_size = WA * HA;

			
 
				+	size_t A_mem_size = sizeof(TYPE) * A_size;

			
 
				+	TYPE* A_data;

			
 
				+

			
 
				+	size_t B_size = WB * HB;

			
 
				+	size_t B_mem_size = sizeof(TYPE) * B_size;

			
 
				+	TYPE* B_data;

			
 
				+

			
 
				+	size_t C_size = WC * HC;

			
 
				+	size_t C_mem_size = sizeof(TYPE) * C_size;

			
 
				+	TYPE* C_data;

			
 
				+

			
 
				+	parse_args(argc, argv);

			
 
				+

			
 
				+	check(clGetPlatformIDs(5, platforms, &platform_count));

			
 
				+	if (platform_count == 0)

			
 
				+		error("No platform found\n");

			
 
				+

			
 
				+	cl_uint device_count;

			
 
				+	cl_uint devs[platform_count];

			
 
				+	cl_device_id * devices[platform_count];

			
 
				+	cl_context ctx[platform_count];

			
 
				+	cl_command_queue * commandQueue[platform_count];

			
 
				+

			
 
				+	device_count = 0;

			
 
				+	for (p=0; p<platform_count; p++) {

			
 
				+		cl_platform_id platform = platforms[p];

			
 
				+

			
 
				+		cl_int err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);

			
 
				+		if (err == CL_DEVICE_NOT_FOUND) {

			
 
				+			devs[p] = 0;

			
 
				+			continue;

			
 
				+		}

			
 
				+		check(err);

			
 
				+		if (devs[p] == 0)

			
 
				+			continue;

			
 
				+

			
 
				+		devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);

			
 
				+		commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);

			
 
				+

			
 
				+		check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));

			
 
				+

			
 
				+		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};

			
 
				+		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));

			
 
				+

			
 
				+		for(i = 0; i < devs[p]; ++i) 

			
 
				+		{

			
 
				+			cl_device_id device = devices[p][i];

			
 
				+			char name[2048];

			
 
				+			name[0] = '\0';

			
 
				+			clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);

			
 
				+			printf("Device %d: %s\n", i, name);

			
 
				+

			
 
				+			check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));

			
 
				+		}

			
 
				+

			
 
				+		device_count += devs[p];

			
 
				+	}

			
 
				+

			
 
				+	if (device_count == 0)

			
 
				+		error("No device found\n");

			
 
				+

			
 
				+

			
 
				+

			
 
				+	cl_kernel multiplicationKernel[platform_count];

			
 
				+

			
 
				+	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n", 

			
 
				+			(unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);

			
 
				+

			
 
				+	// allocate host memory for matrices A, B and C

			
 
				+	A_data = (TYPE*)malloc(A_mem_size);

			
 
				+	if (A_data == NULL) {

			
 
				+		perror("malloc");

			
 
				+	}

			
 
				+

			
 
				+	B_data = (TYPE*)malloc(B_mem_size);

			
 
				+	if (B_data == NULL) {

			
 
				+		perror("malloc");

			
 
				+	}

			
 
				+

			
 
				+	C_data = (TYPE*) malloc(C_mem_size);

			
 
				+	if (C_data == NULL) {

			
 
				+		perror("malloc");

			
 
				+	}

			
 
				+

			
 
				+	cl_program program[platform_count];

			
 
				+

			
 
				+	for (p=0; p<platform_count; p++) {

			
 
				+		if (devs[p] == 0)

			
 
				+			continue;

			
 
				+

			
 
				+		check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));

			
 
				+

			
 
				+		check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));

			
 
				+

			
 
				+		check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));

			
 
				+	}

			
 
				+

			
 
				+	printf("Initializing data...\n");

			
 
				+	srand(2008);

			
 
				+	fillArray(A_data, A_size);

			
 
				+	fillArray(B_data, B_size);

			
 
				+	memset(C_data, 0, C_size);

			
 
				+

			
 
				+

			
 
				+	printf("Computing...\n");

			
 
				+	workOffset[0] = 0;

			
 
				+	gettimeofday(&start, NULL);

			
 
				+

			
 
				+	size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};

			
 
				+	int c = 0;

			
 
				+	for (p=0; p<platform_count;p++) {

			
 
				+		for (i=0; i<devs[p]; i++) {

			
 
				+			check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));

			
 
				+			c++;

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	for(i=0; i < BLOCKS; ++i) 

			
 
				+	{

			
 
				+		int d = i % device_count;

			
 
				+		cl_uint p = 0;

			
 
				+

			
 
				+		// determine device platform

			
 
				+		int dev = d;

			
 
				+		for (p = 0; p < platform_count; p++) {

			
 
				+			if ((cl_int)(dev - devs[p]) < 0)

			
 
				+				break;

			
 
				+			dev -= devs[p];

			
 
				+		}

			
 
				+

			
 
				+		workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;        

			
 
				+

			
 
				+		check2(d_A[i] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));

			
 
				+		check2(d_C[i] = clCreateBuffer(ctx[p], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

			
 
				+

			
 
				+		check(clSetKernelArg(multiplicationKernel[p], 0, sizeof(cl_int), &workSize[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[p], 1, sizeof(cl_int), &workSize[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[p], 2, sizeof(cl_int), &workSize[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[p], 3, sizeof(cl_mem), (void *) &d_A[i]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[p], 4, sizeof(cl_mem), (void *) &d_B[d]));

			
 
				+		check(clSetKernelArg(multiplicationKernel[p], 5, sizeof(cl_mem), (void *) &d_C[i]));

			
 
				+

			
 
				+		size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE,WC), shrRoundUp(BLOCK_SIZE,workSize[i])};

			
 
				+

			
 
				+		check(clEnqueueNDRangeKernel(commandQueue[p][dev], multiplicationKernel[p], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

			
 
				+

			
 
				+		// Non-blocking copy of result from device to host

			
 
				+		check2(ptrs[i] = clEnqueueMapBuffer(commandQueue[p][dev], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));

			
 
				+

			
 
				+		if(i+1 < BLOCKS)

			
 
				+			workOffset[i + 1] = workOffset[i] + workSize[i];

			
 
				+	}

			
 
				+

			
 
				+

			
 
				+	// CPU sync with GPU

			
 
				+	for (p=0; p<platform_count;p++) {

			
 
				+		cl_uint dev;

			
 
				+		for (dev=0; dev<devs[p]; dev++) {

			
 
				+			clFinish(commandQueue[p][dev]);

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	gettimeofday(&end, NULL);

			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));

			
 
				+

			
 
				+	double dSeconds = timing/1000/1000;

			
 
				+	double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;

			
 
				+	double gflops = 1.0e-9 * dNumOps/dSeconds;

			
 
				+

			
 
				+	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n", 

			
 
				+			gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);

			
 
				+

			
 
				+	for (i=0; i<device_count; i++) {

			
 
				+		clReleaseMemObject(d_B[i]);

			
 
				+	}

			
 
				+

			
 
				+	for(i = 0; i < BLOCKS; i++) 

			
 
				+	{

			
 
				+		clReleaseMemObject(d_A[i]);

			
 
				+		clReleaseMemObject(d_C[i]);

			
 
				+		clReleaseEvent(GPUExecution[i]);

			
 
				+		clReleaseEvent(GPUDone[i]);

			
 
				+	}

			
 
				+

			
 
				+

			
 
				+	// compute reference solution

			
 
				+	if (check) {

			
 
				+		printf("Comparing results with CPU computation... ");

			
 
				+		TYPE* reference = (TYPE*)malloc(C_mem_size);

			
 
				+		computeReference(reference, A_data, B_data, HA, WA, WB);

			
 
				+

			
 
				+		// check result

			
 
				+		int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);

			
 
				+		if (res == 0) {

			
 
				+			printf("\n\n");

			
 
				+			printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);

			
 
				+		}

			
 
				+		else printf("PASSED\n\n");

			
 
				+		free(reference);

			
 
				+	}

			
 
				+

			
 
				+	for (p=0; p<platform_count;p++) {

			
 
				+		if (devs[p] == 0)

			
 
				+			continue;

			
 
				+

			
 
				+		check(clReleaseKernel(multiplicationKernel[p]));

			
 
				+		check(clReleaseProgram(program[p]));

			
 
				+		check(clReleaseContext(ctx[p]));

			
 
				+		cl_uint k;

			
 
				+		for(k = 0; k < devs[p]; ++k) 

			
 
				+		{

			
 
				+			check(clReleaseCommandQueue(commandQueue[p][k]));

			
 
				+		}

			
 
				+	}

			
 
				+

			
 
				+	free(A_data);

			
 
				+	free(B_data);

			
 
				+	free(C_data);

			
 
				+

			
 
				+	return 0;

			
 
				+}

			
 
				+

			
 
				+void printDiff(TYPE *data1, TYPE *data2, int width, int height, int iListLength, TYPE fListTol) { /* Log the elements of data1 and data2 that differ by more than fListTol: at most iListLength are listed, all are counted. Both arrays are read as height rows of width elements. */

			
 
				+	shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol);

			
 
				+	int i,j,k;

			
 
				+	int error_count=0;

			
 
				+	for (j = 0; j < height; j++) {

			
 
				+		if (error_count < iListLength) { /* row headers stop once the listing quota is used up */

			
 
				+			shrLog("\n  Row %d:\n", j);

			
 
				+		}

			
 
				+		for (i = 0; i < width; i++) {

			
 
				+			k = j * width + i; /* flat index of row j, column i */

			
 
				+			float fDiff = fabs(data1[k] - data2[k]);

			
 
				+			if (fDiff > fListTol) {                

			
 
				+				if (error_count < iListLength) { /* data1 is reported as the CPU value, data2 as the GPU value */

			
 
				+					shrLog("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);

			
 
				+				}

			
 
				+				error_count++; /* counted even when no longer printed */

			
 
				+			}

			
 
				+		}

			
 
				+	}

			
 
				+	shrLog(" \n  Total Errors = %d\n\n", error_count);

			
 
				+}

			
 
				+

			
 
				+/**

			
 
				+ * Compute the reference product with a plain triple loop:

			
 
				+ * C = A * B, all three matrices indexed row-major

			
 
				+ * @param C          output (hA x wB); written here, must be preallocated by the caller

			
 
				+ * @param A          matrix A as provided to device (hA x wA)

			
 
				+ * @param B          matrix B as provided to device (wA x wB)

			
 
				+ * @param hA         height of matrix A (wA is the width of A, i.e. the height of B)

			
 
				+ * @param wB         width of matrix B

			
 
				+*/

			
 
				+void computeReference(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, unsigned int wA, unsigned int wB) {

			
 
				+	unsigned int i,j,k;

			
 
				+	for (i = 0; i < hA; ++i)

			
 
				+		for (j = 0; j < wB; ++j) {

			
 
				+			double sum = 0; /* accumulated in double, narrowed to TYPE once per element */

			
 
				+			for (k = 0; k < wA; ++k) {

			
 
				+				double a = A[i * wA + k]; /* row i of A */

			
 
				+				double b = B[k * wB + j]; /* column j of B */

			
 
				+				sum += a * b;

			
 
				+			}

			
 
				+			C[i * wB + j] = (TYPE)sum;

			
 
				+		}

			
 
				+}

			
 
				+

			
--- a/socl/src/init.c
+++ b/socl/src/init.c
@@ -28,9 +28,6 @@ __attribute__((constructor)) static void socl_init() {
 
				   struct starpu_conf conf;
			
 
				   starpu_conf_init(&conf);
			
 
				   conf.ncuda = 0;
			
 
				-  putenv("STARPU_NCUDA=0");
			
 
				-  putenv("STARPU_NOPENCL=1");
			
 
				-  putenv("STARPU_NCPUS=1");
			
 
				 
			
 
				   mem_object_init();
			
 
				 
			
--- a/src/core/dependencies/data_concurrency.c
+++ b/src/core/dependencies/data_concurrency.c
@@ -260,8 +260,11 @@ static unsigned unlock_one_requester(struct _starpu_data_requester *r)
 
				 		return 0;
			
 
				 }
			
 
				 
			
 
				-/* The header lock must already be taken by the caller */
			
 
				-void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
			
 
				+/* The header lock must already be taken by the caller.
			
 
				+ * This may free the handle if it was lazily unregistered (1 is returned in
			
 
				+ * that case). The handle pointer thus becomes invalid for the caller.
			
 
				+ */
			
 
				+int _starpu_notify_data_dependencies(starpu_data_handle_t handle)
			
 
				 {
			
 
				 	_starpu_spin_checklocked(&handle->header_lock);
			
 
				 	/* A data access has finished so we remove a reference. */
			
@@ -269,19 +272,9 @@ void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 
				 	handle->refcnt--;
			
 
				 	STARPU_ASSERT(handle->busy_count > 0);
			
 
				 	handle->busy_count--;
			
 
				-	_starpu_data_check_not_busy(handle);
			
 
				-
			
 
				-	/* The handle has been destroyed in between (eg. this was a temporary
			
 
				-	 * handle created for a reduction.) */
			
 
				-	if (handle->lazy_unregister && handle->refcnt == 0)
			
 
				-	{
			
 
				-		_starpu_spin_unlock(&handle->header_lock);
			
 
				-		starpu_data_unregister_no_coherency(handle);
			
 
				-		/* Warning: in case we unregister the handle, we must be sure
			
 
				-		 * that the caller will not try to unlock the header after
			
 
				-		 * !*/
			
 
				-		return;
			
 
				-	}
			
 
				+	if (_starpu_data_check_not_busy(handle))
			
 
				+		/* Handle was destroyed, nothing left to do.  */
			
 
				+		return 1;
			
 
				 
			
 
				 	/* In case there is a pending reduction, and that this is the last
			
 
				 	 * requester, we may go back to a "normal" coherency model. */
			
@@ -358,7 +351,10 @@ void _starpu_notify_data_dependencies(starpu_data_handle_t handle)
 
				 			_starpu_spin_lock(&handle->header_lock);
			
 
				 			STARPU_ASSERT(handle->busy_count > 0);
			
 
				 			handle->busy_count--;
			
 
				-			_starpu_data_check_not_busy(handle);
			
 
				+			if (_starpu_data_check_not_busy(handle))
			
 
				+				return 1;
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
--- a/src/core/dependencies/data_concurrency.h
+++ b/src/core/dependencies/data_concurrency.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -22,7 +22,7 @@
 
				 
			
 
				 unsigned _starpu_submit_job_enforce_data_deps(struct _starpu_job *j);
			
 
				 
			
 
				-void _starpu_notify_data_dependencies(starpu_data_handle_t handle);
			
 
				+int _starpu_notify_data_dependencies(starpu_data_handle_t handle);
			
 
				 
			
 
				 unsigned _starpu_attempt_to_submit_data_request_from_apps(starpu_data_handle_t handle,
			
 
				 							  enum starpu_access_mode mode,
			
--- a/src/core/dependencies/implicit_data_deps.c
+++ b/src/core/dependencies/implicit_data_deps.c
@@ -405,6 +405,41 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 
				 	_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->sequential_consistency_mutex);
			
 
				 }
			
 
				 
			
 
				+/* Same as _starpu_release_data_enforce_sequential_consistency, but applied to
			
 
				+ * every buffer of job j's task; also drops one busy_count reference per handle */
			
 
				+void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j)
			
 
				+{
			
 
				+	struct starpu_task *task = j->task;
			
 
				+        struct starpu_buffer_descr *descrs = j->ordered_buffers;
			
 
				+
			
 
				+	if (!task->cl) /* no codelet, so no nbuffers to read: nothing to release */
			
 
				+		return;
			
 
				+
			
 
				+        unsigned nbuffers = task->cl->nbuffers;
			
 
				+
			
 
				+	unsigned index;
			
 
				+	for (index = 0; index < nbuffers; index++)
			
 
				+	{
			
 
				+		starpu_data_handle_t handle = descrs[index].handle;
			
 
				+
			
 
				+		if (index && descrs[index-1].handle == descrs[index].handle)
			
 
				+			/* Same handle as the previous entry: it has already been
			
 
				+			 * released, skip it. This depends on ordering putting
			
 
				+			 * writes before reads, see _starpu_compar_handles */
			
 
				+			continue;
			
 
				+
			
 
				+		_starpu_release_data_enforce_sequential_consistency(task, handle);
			
 
				+		/* Release the reference acquired in _starpu_push_task_output */
			
 
				+		_starpu_spin_lock(&handle->header_lock);
			
 
				+		STARPU_ASSERT(handle->busy_count > 0);
			
 
				+		handle->busy_count--;
			
 
				+		if (!_starpu_data_check_not_busy(handle)) /* non-zero: handle was freed and its header lock already released */
			
 
				+			_starpu_spin_unlock(&handle->header_lock);
			
 
				+
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle)
			
 
				 {
			
 
				         _STARPU_LOG_IN();
			
--- a/src/core/dependencies/implicit_data_deps.h
+++ b/src/core/dependencies/implicit_data_deps.h
@@ -25,6 +25,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
				 						   starpu_data_handle_t handle, enum starpu_access_mode mode);
			
 
				 void _starpu_detect_implicit_data_deps(struct starpu_task *task);
			
 
				 void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *task, starpu_data_handle_t handle);
			
 
				+void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
			
 
				 
			
 
				 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
			
 
				 void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -163,6 +163,9 @@ void _starpu_handle_job_termination(struct _starpu_job *j, int workerid)
 
				 
			
 
				 	_STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
			
 
				 
			
 
				+	/* Tell other tasks that we don't exist any more, thus no need for
			
 
				+	 * implicit dependencies any more.  */
			
 
				+	_starpu_release_task_enforce_sequential_consistency(j);
			
 
				 	/* Task does not have a cl, but has explicit data dependencies, we need
			
 
				 	 * to tell them that we will not exist any more before notifying the
			
 
				 	 * tasks waiting for us */
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -46,7 +46,7 @@ extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
 
				 extern struct starpu_sched_policy _starpu_sched_eager_policy;
			
 
				 extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
			
 
				 extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
			
 
				-extern struct starpu_sched_policy heft_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_heft_policy;
			
 
				 
			
 
				 static struct starpu_sched_policy *predefined_policies[] =
			
 
				 {
			
@@ -54,7 +54,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 
				 	&_starpu_sched_prio_policy,
			
 
				 	&_starpu_sched_dm_policy,
			
 
				 	&_starpu_sched_dmda_policy,
			
 
				-	&heft_policy,
			
 
				+	&_starpu_sched_heft_policy,
			
 
				 	&_starpu_sched_dmda_ready_policy,
			
 
				 	&_starpu_sched_dmda_sorted_policy,
			
 
				 	&_starpu_sched_random_policy,
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -559,16 +559,8 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 
				 
			
 
				 	STARPU_ASSERT(handle->busy_count > 0);
			
 
				 	handle->busy_count--;
			
 
				-	_starpu_data_check_not_busy(handle);
			
 
				 
			
 
				-	/* In case there was a temporary handle (eg. used for reduction), this
			
 
				-	 * handle may have requested to be destroyed when the data is released
			
 
				-	 * */
			
 
				-	unsigned handle_was_destroyed = handle->lazy_unregister;
			
 
				-
			
 
				-	_starpu_notify_data_dependencies(handle);
			
 
				-
			
 
				-	if (!handle_was_destroyed)
			
 
				+	if (!_starpu_notify_data_dependencies(handle))
			
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
 
				 }
			
 
				 
			
@@ -723,15 +715,13 @@ void _starpu_push_task_output(struct _starpu_job *j, uint32_t mask)
 
				 
			
 
				 		local_replicate = get_replicate(handle, mode, workerid, local_memory_node);
			
 
				 
			
 
				-		/* In case there was a temporary handle (eg. used for
			
 
				-		 * reduction), this handle may have requested to be destroyed
			
 
				-		 * when the data is released
			
 
				-		 * */
			
 
				-		unsigned handle_was_destroyed = handle->lazy_unregister;
			
 
				+		/* Keep a reference for future
			
 
				+		 * _starpu_release_task_enforce_sequential_consistency call */
			
 
				+		_starpu_spin_lock(&handle->header_lock);
			
 
				+		handle->busy_count++;
			
 
				+		_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				 		_starpu_release_data_on_node(handle, mask, local_replicate);
			
 
				-		if (!handle_was_destroyed)
			
 
				-			_starpu_release_data_enforce_sequential_consistency(task, handle);
			
 
				 	}
			
 
				 
			
 
				 	if (profiling && task->profiling_info)
			
--- a/src/datawizard/coherency.h
+++ b/src/datawizard/coherency.h
@@ -110,7 +110,7 @@ struct _starpu_data_state
 
				 	struct _starpu_spinlock header_lock;
			
 
				 
			
 
				 	/* Condition to make application wait for all transfers before freeing handle */
			
 
				-	/* busy_count is the number of handle->refcnt, handle->per_node[*]->refcnt, and number of starpu_data_requesters */
			
 
				+	/* busy_count is the number of handle->refcnt, handle->per_node[*]->refcnt, number of starpu_data_requesters, and number of tasks that have released it but are still registered on the implicit data dependency lists. */
			
 
				 	/* Core code which releases busy_count has to call
			
 
				 	 * _starpu_data_check_not_busy to let starpu_data_unregister proceed */
			
 
				 	unsigned busy_count;
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -296,7 +296,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
				 		handle->busy_count--;
			
 
				 	}
			
 
				 
			
 
				-	_starpu_data_check_not_busy(handle);
			
 
				+	unsigned destroyed = _starpu_data_check_not_busy(handle);
			
 
				 
			
 
				 	r->refcnt--;
			
 
				 
			
@@ -314,7 +314,8 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
				 	if (do_delete)
			
 
				 		starpu_data_request_destroy(r);
			
 
				 
			
 
				-	_starpu_spin_unlock(&handle->header_lock);
			
 
				+	if (!destroyed)
			
 
				+		_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				 	/* We do the callback once the lock is released so that they can do
			
 
				 	 * blocking operations with the handle (eg. release it) */
			
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -405,8 +405,11 @@ struct _starpu_unregister_callback_arg
 
				 
			
 
				 /* Check whether we should tell starpu_data_unregister that the data handle is
			
 
				  * not busy any more.
			
 
				- * The header is supposed to be locked */
			
 
				-void _starpu_data_check_not_busy(starpu_data_handle_t handle)
			
 
				+ * The header is supposed to be locked.
			
 
				+ * This may free the handle, if it was lazily unregistered (1 is returned in
			
 
				+ * that case).  The handle pointer thus becomes invalid for the caller.
			
 
				+ */
			
 
				+int _starpu_data_check_not_busy(starpu_data_handle_t handle)
			
 
				 {
			
 
				 	if (!handle->busy_count && handle->busy_waiting)
			
 
				 	{
			
@@ -414,6 +417,20 @@ void _starpu_data_check_not_busy(starpu_data_handle_t handle)
 
				 		_STARPU_PTHREAD_COND_BROADCAST(&handle->busy_cond);
			
 
				 		_STARPU_PTHREAD_MUTEX_UNLOCK(&handle->busy_mutex);
			
 
				 	}
			
 
				+
			
 
				+	/* The handle has been destroyed in between (eg. this was a temporary
			
 
				+	 * handle created for a reduction.) */
			
 
				+	if (handle->lazy_unregister && handle->busy_count == 0)
			
 
				+	{
			
 
				+		_starpu_spin_unlock(&handle->header_lock);
			
 
				+		starpu_data_unregister_no_coherency(handle);
			
 
				+		/* Warning: in case we unregister the handle, we must be sure
			
 
				+		 * that the caller will not try to unlock the header after
			
 
				+		 * !*/
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 static void _starpu_data_unregister_fetch_data_callback(void *_arg)
			
@@ -519,14 +536,16 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
				 			func(buffers, NULL);
			
 
				 		}
			
 
				 	}
			
 
				-	else
			
 
				-	{
			
 
				+
			
 
				+	_starpu_spin_lock(&handle->header_lock);
			
 
				+	if (!coherent) {
			
 
				 		/* Should we postpone the unregister operation ? */
			
 
				-		if ((handle->refcnt > 0) && handle->lazy_unregister)
			
 
				+		if ((handle->busy_count > 0) && handle->lazy_unregister) {
			
 
				+			_starpu_spin_unlock(&handle->header_lock);
			
 
				 			return;
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				-	_starpu_spin_lock(&handle->header_lock);
			
 
				 	/* Tell holders of references that we're starting waiting */
			
 
				 	handle->busy_waiting = 1;
			
 
				 	_starpu_spin_unlock(&handle->header_lock);
			
--- a/src/datawizard/interfaces/data_interface.h
+++ b/src/datawizard/interfaces/data_interface.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -27,7 +27,7 @@ void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 
				 	STARPU_ATTRIBUTE_INTERNAL;
			
 
				 
			
 
				 extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
			
 
				-extern void _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
			
 
				+extern int _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
			
 
				 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
			
 
				 
			
 
				 extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
			
--- a/src/datawizard/reduction.c
+++ b/src/datawizard/reduction.c
@@ -342,7 +342,9 @@ void _starpu_data_end_reduction_mode_terminate(starpu_data_handle_t handle)
 
				 		if (handle->reduction_tmp_handles[worker])
			
 
				 		{
			
 
				 //			fprintf(stderr, "unregister handle %p\n", handle);
			
 
				+			_starpu_spin_lock(&handle->reduction_tmp_handles[worker]->header_lock);
			
 
				 			handle->reduction_tmp_handles[worker]->lazy_unregister = 1;
			
 
				+			_starpu_spin_unlock(&handle->reduction_tmp_handles[worker]->header_lock);
			
 
				 			starpu_data_unregister_no_coherency(handle->reduction_tmp_handles[worker]);
			
 
				 			handle->per_worker[worker].refcnt--;
			
 
				 			/* TODO put in cache */
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -334,8 +334,8 @@ static void _prefetch_data_on_node(void *arg)
 
				 	}
			
 
				 
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				-	_starpu_notify_data_dependencies(handle);
			
 
				-	_starpu_spin_unlock(&handle->header_lock);
			
 
				+	if (!_starpu_notify_data_dependencies(handle))
			
 
				+		_starpu_spin_unlock(&handle->header_lock);
			
 
				 }
			
 
				 
			
 
				 static
			
@@ -376,17 +376,12 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
				 			STARPU_ASSERT(replicate->refcnt >= 0);
			
 
				 			STARPU_ASSERT(handle->busy_count > 0);
			
 
				 			handle->busy_count--;
			
 
				-			_starpu_data_check_not_busy(handle);
			
 
				 		}
			
 
				 
			
 
				 		/* In case there was a temporary handle (eg. used for reduction), this
			
 
				 		 * handle may have requested to be destroyed when the data is released
			
 
				 		 * */
			
 
				-		unsigned handle_was_destroyed = handle->lazy_unregister;
			
 
				-
			
 
				-		_starpu_notify_data_dependencies(handle);
			
 
				-
			
 
				-		if (!handle_was_destroyed)
			
 
				+		if (!_starpu_notify_data_dependencies(handle))
			
 
				 			_starpu_spin_unlock(&handle->header_lock);
			
 
				 	}
			
 
				 	else if (!async)
			
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -24,8 +24,8 @@ static void wt_callback(void *arg)
 
				 	starpu_data_handle_t handle = (starpu_data_handle_t) arg;
			
 
				 
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				-	_starpu_notify_data_dependencies(handle);
			
 
				-	_starpu_spin_unlock(&handle->header_lock);
			
 
				+	if (!_starpu_notify_data_dependencies(handle))
			
 
				+		_starpu_spin_unlock(&handle->header_lock);
			
 
				 }
			
 
				 
			
 
				 void _starpu_write_through_data(starpu_data_handle_t handle, uint32_t requesting_node,
			
--- a/src/sched_policies/heft.c
+++ b/src/sched_policies/heft.c
@@ -641,7 +641,7 @@ static void heft_deinit(unsigned sched_ctx_id)
 
				 	starpu_delete_worker_collection_for_sched_ctx(sched_ctx_id);
			
 
				 }
			
 
				 
			
 
				-struct starpu_sched_policy heft_policy = 
			
 
				+struct starpu_sched_policy _starpu_sched_heft_policy =
			
 
				 {
			
 
				 	.init_sched = heft_init,
			
 
				 	.deinit_sched = heft_deinit,
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -215,7 +215,11 @@ noinst_PROGRAMS =				\
 
				 	parallel_tasks/parallel_kernels		\
			
 
				 	parallel_tasks/parallel_kernels_spmd	\
			
 
				 	perfmodels/regression_based		\
			
 
				-	perfmodels/non_linear_regression_based 
			
 
				+	perfmodels/non_linear_regression_based  \
			
 
				+	sched_policies/data_locality            \
			
 
				+	sched_policies/execute_all_tasks        \
			
 
				+	sched_policies/simple_deps              \
			
 
				+	sched_policies/simple_cpu_gpu_sched
			
 
				 
			
 
				 if STARPU_HAVE_WINDOWS
			
 
				 check_PROGRAMS = $(noinst_PROGRAMS)
			
@@ -543,5 +547,7 @@ perfmodels_non_linear_regression_based_SOURCES+=\
 
				 	perfmodels/opencl_memset.c
			
 
				 endif
			
 
				 
			
 
				+sched_policies_execute_all_tasks_LDFLAGS = -lm
			
 
				+
			
 
				 showcheck:
			
 
				 	-cat $(TEST_LOGS) /dev/null
			
--- a/tests/datawizard/gpu_register.c
+++ b/tests/datawizard/gpu_register.c
@@ -21,6 +21,13 @@
 
				 #include "../helper.h"
			
 
				 #include "scal.h"
			
 
				 
			
 
				+#if ! (defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA))
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	return STARPU_TEST_SKIPPED; /* built with neither OpenCL nor CUDA support: report the test as skipped */
			
 
				+}
			
 
				+#else
			
 
				+
			
 
				 static int
			
 
				 submit_tasks(starpu_data_handle_t handle, int pieces, int n)
			
 
				 {
			
@@ -292,3 +299,5 @@ fail:
 
				 	starpu_shutdown();
			
 
				 	return EXIT_FAILURE;
			
 
				 }
			
 
				+
			
 
				+#endif /* defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) */
			
--- a/tests/datawizard/readonly.c
+++ b/tests/datawizard/readonly.c
@@ -21,10 +21,12 @@
 
				 #endif
			
 
				 #include "../helper.h"
			
 
				 
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				 static void codelet(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				      FPRINTF(stderr, "codelet\n");
			
 
				 }
			
 
				+#endif
			
 
				 
			
 
				 static struct starpu_codelet cl =
			
 
				 {
			
--- a/tests/sched_policies/data_locality.c
+++ b/tests/sched_policies/data_locality.c
@@ -0,0 +1,193 @@
 
				+#include <starpu.h>
			
 
				+#include <starpu_profiling.h>
			
 
				+
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+#define NTASKS 8
			
 
				+
			
 
				+/*
			
 
				+ * It is very inefficient to keep moving data between memory nodes. This
			
 
				+ * test makes sure the scheduler will take account of the data locality
			
 
				+ * when scheduling tasks.
			
 
				+ *
			
 
				+ * Applies to : dmda, heft, pheft.
			
 
				+ */
			
 
				+
			
 
				+static void
			
 
				+dummy(void *buffers[], void *args)
			
 
				+{
			
 
				+	(void) buffers;
			
 
				+	(void) args;
			
 
				+}
			
 
				+
			
 
				+/* 
			
 
				+ * Dummy cost function, used to make sure the scheduler does schedule the
			
 
				+ * task, instead of getting rid of it as soon as possible because it doesn't
			
 
				+ * know its expected length.
			
 
				+ */
			
 
				+static double
			
 
				+cost_function(struct starpu_task *task, unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) nimpl;
			
 
				+	return 0.0;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel model =
			
 
				+{
			
 
				+	.type          = STARPU_COMMON,
			
 
				+	.cost_function = cost_function
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				+	.cpu_funcs     = { dummy, NULL },
			
 
				+	.cuda_funcs    = { dummy, NULL },
			
 
				+	.opencl_funcs  = { dummy, NULL },
			
 
				+	.modes         = { STARPU_RW },
			
 
				+	.model         = &model,
			
 
				+	.nbuffers      = 1
			
 
				+};
			
 
				+
			
 
				+static int var = 42;
			
 
				+static starpu_data_handle_t rw_handle;
			
 
				+
			
 
				+static void
			
 
				+init_data(void)
			
 
				+{
			
 
				+	starpu_variable_data_register(&rw_handle, 0, (uintptr_t) &var,
			
 
				+					sizeof(var));
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+free_data(void)
			
 
				+{
			
 
				+	starpu_data_unregister(rw_handle);
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+run(struct starpu_sched_policy *policy)
			
 
				+{
			
 
				+	int ret;
			
 
				+	struct starpu_conf conf;
			
 
				+
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.sched_policy = policy;
			
 
				+
			
 
				+	ret = starpu_init(&conf);
			
 
				+	if (ret == -ENODEV)
			
 
				+		goto enodev;
			
 
				+
			
 
				+	if (starpu_cpu_worker_get_count() == 0 ||
			
 
				+	    (starpu_cuda_worker_get_count() == 0 && 
			
 
				+	     starpu_opencl_worker_get_count() == 0))
			
 
				+		goto enodev;
			
 
				+
			
 
				+	starpu_profiling_status_set(1);
			
 
				+	init_data();
			
 
				+
			
 
				+	/* Send the handle to a GPU. */
			
 
				+	cl.where = STARPU_CUDA | STARPU_OPENCL;
			
 
				+	struct starpu_task *tasks[NTASKS];
			
 
				+	tasks[0] = starpu_task_create();
			
 
				+	tasks[0]->cl = &cl;
			
 
				+	tasks[0]->synchronous = 1;
			
 
				+	tasks[0]->handles[0] = rw_handle;
			
 
				+	tasks[0]->destroy = 0;
			
 
				+	ret = starpu_task_submit(tasks[0]);
			
 
				+	if (ret == -ENODEV)
			
 
				+		goto enodev;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+	
			
 
				+	/* Now, run multiple tasks using this handle. */
			
 
				+	cl.where |= STARPU_CPU;
			
 
				+	int i;
			
 
				+	for (i = 1; i < NTASKS; i++)
			
 
				+	{
			
 
				+		tasks[i] = starpu_task_create();
			
 
				+		tasks[i]->cl = &cl;
			
 
				+		tasks[i]->handles[0] = rw_handle;
			
 
				+		tasks[i]->destroy = 0;
			
 
				+		ret = starpu_task_submit(tasks[i]);
			
 
				+		if (ret == -ENODEV)
			
 
				+			goto enodev;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	/* All tasks should have been executed on the same GPU. */
			
 
				+	ret = 0;
			
 
				+	unsigned workerid = tasks[0]->profiling_info->workerid;
			
 
				+	for (i = 0; i < NTASKS; i++)
			
 
				+	{
			
 
				+		if (tasks[i]->profiling_info->workerid != workerid)
			
 
				+		{
			
 
				+			ret = 1;
			
 
				+			break;
			
 
				+		}
			
 
				+		starpu_task_destroy(tasks[i]);
			
 
				+	}
			
 
				+
			
 
				+	/* Clean everything up. */
			
 
				+	for (; i < NTASKS; i++)
			
 
				+		starpu_task_destroy(tasks[i]);
			
 
				+
			
 
				+	free_data();
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return ret;
			
 
				+
			
 
				+enodev:
			
 
				+	starpu_shutdown();
			
 
				+	return -ENODEV;
			
 
				+
			
 
				+}
			
 
				+
			
 
				+/* XXX: Does this test apply to other schedulers ? */
			
 
				+//extern struct starpu_sched_policy _starpu_sched_ws_policy;
			
 
				+//extern struct starpu_sched_policy _starpu_sched_prio_policy;
			
 
				+//extern struct starpu_sched_policy _starpu_sched_random_policy;
			
 
				+//extern struct starpu_sched_policy _starpu_sched_dm_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_policy;
			
 
				+//extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy;
			
 
				+//extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
			
 
				+//extern struct starpu_sched_policy _starpu_sched_eager_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
			
 
				+//extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_heft_policy;
			
 
				+
			
 
				+static struct starpu_sched_policy *policies[] =
			
 
				+{
			
 
				+	//&_starpu_sched_ws_policy,
			
 
				+	//&_starpu_sched_prio_policy,
			
 
				+	//&_starpu_sched_dm_policy,
			
 
				+	&_starpu_sched_dmda_policy,
			
 
				+	&_starpu_sched_heft_policy,
			
 
				+	//&_starpu_sched_dmda_ready_policy,
			
 
				+	//&_starpu_sched_dmda_sorted_policy,
			
 
				+	//&_starpu_sched_random_policy,
			
 
				+	//&_starpu_sched_eager_policy,
			
 
				+	&_starpu_sched_parallel_heft_policy,
			
 
				+	//&_starpu_sched_pgreedy_policy
			
 
				+};
			
 
				+
			
 
				+int
			
 
				+main(void)
			
 
				+{
			
 
				+	int i;
			
 
				+	int n_policies = sizeof(policies)/sizeof(policies[0]);
			
 
				+	for (i = 0; i < n_policies; ++i)
			
 
				+	{
			
 
				+		struct starpu_sched_policy *policy = policies[i];
			
 
				+		FPRINTF(stdout, "Running with policy %s.\n",
			
 
				+			policy->policy_name);
			
 
				+		int ret = run(policy);
			
 
				+		if (ret == -ENODEV)
			
 
				+			return STARPU_TEST_SKIPPED;
			
 
				+		if (ret == 1)
			
 
				+			return EXIT_FAILURE;
			
 
				+	}
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
--- a/tests/sched_policies/execute_all_tasks.c
+++ b/tests/sched_policies/execute_all_tasks.c
@@ -0,0 +1,140 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <math.h>
			
 
				+#include <unistd.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_profiling.h>
			
 
				+
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+/*
			
 
				+ * All tasks submitted to StarPU should be executed once.
			
 
				+ * Applies to: all schedulers.
			
 
				+ */
			
 
				+
			
 
				+#define NTASKS           8
			
 
				+#define TASK_DURATION    1e6 /* In microseconds */
			
 
				+
			
 
				+extern struct starpu_sched_policy _starpu_sched_ws_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_prio_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_random_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dm_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_eager_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_heft_policy;
			
 
				+
			
 
				+static struct starpu_sched_policy *policies[] =
			
 
				+{
			
 
				+	&_starpu_sched_ws_policy,
			
 
				+	&_starpu_sched_prio_policy,
			
 
				+	&_starpu_sched_dm_policy,
			
 
				+	&_starpu_sched_dmda_policy,
			
 
				+	&_starpu_sched_heft_policy,
			
 
				+	&_starpu_sched_dmda_ready_policy,
			
 
				+	&_starpu_sched_dmda_sorted_policy,
			
 
				+	&_starpu_sched_random_policy,
			
 
				+	&_starpu_sched_eager_policy,
			
 
				+	&_starpu_sched_parallel_heft_policy,
			
 
				+	&_starpu_sched_pgreedy_policy
			
 
				+};
			
 
				+
			
 
				+static void
			
 
				+dummy(void *buffers[], void *args)
			
 
				+{
			
 
				+	(void) buffers;
			
 
				+	(void) args;
			
 
				+
			
 
				+	usleep(TASK_DURATION);
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+run(struct starpu_sched_policy *p)
			
 
				+{
			
 
				+	int ret;
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV)
			
 
				+		exit(STARPU_TEST_SKIPPED);
			
 
				+
			
 
				+	starpu_profiling_status_set(1);
			
 
				+
			
 
				+	struct starpu_task *tasks[NTASKS] = { NULL };
			
 
				+	struct starpu_codelet cl = 
			
 
				+	{
			
 
				+		.cpu_funcs    = {dummy, NULL},
			
 
				+		.cuda_funcs   = {dummy, NULL},
			
 
				+		.opencl_funcs = {dummy, NULL},
			
 
				+		.nbuffers     = 0
			
 
				+	};
			
 
				+
			
 
				+	int i;
			
 
				+	for (i = 0; i < NTASKS; i++)
			
 
				+	{
			
 
				+		struct starpu_task *task = starpu_task_create();
			
 
				+		tasks[i] = task;
			
 
				+		task->cl = &cl;
			
 
				+		task->synchronous = 1;
			
 
				+		task->destroy = 0;
			
 
				+		ret = starpu_task_submit(task);
			
 
				+		if (ret != 0)
			
 
				+			return 1;
			
 
				+	}
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	for (i = 0; i < NTASKS; i++)
			
 
				+	{
			
 
				+		struct starpu_task_profiling_info *pi;
			
 
				+		double task_len;
			
 
				+
			
 
				+		pi = tasks[i]->profiling_info;
			
 
				+		task_len = starpu_timing_timespec_delay_us(&pi->start_time, &pi->end_time);
			
 
				+		if (task_len < TASK_DURATION/2)
			
 
				+		{
			
 
				+			FPRINTF(stderr, "Failed with task length: %fµs\n", task_len);
			
 
				+			return 1;
			
 
				+		}
			
 
				+
			
 
				+		starpu_task_destroy(tasks[i]);
			
 
				+	}
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+main(void)
			
 
				+{
			
 
				+	int i;
			
 
				+	int n_policies = sizeof(policies)/sizeof(policies[0]);
			
 
				+	for (i = 0; i < n_policies; ++i)
			
 
				+	{
			
 
				+		struct starpu_sched_policy *policy = policies[i];
			
 
				+		FPRINTF(stdout, "Running with policy %s.\n",
			
 
				+			policy->policy_name);
			
 
				+		int ret;
			
 
				+		ret = run(policy);
			
 
				+		if (ret == 1)
			
 
				+			return EXIT_FAILURE;
			
 
				+	}
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
--- a/tests/sched_policies/simple_cpu_gpu_sched.c
+++ b/tests/sched_policies/simple_cpu_gpu_sched.c
@@ -0,0 +1,242 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_profiling.h>
			
 
				+
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+/*
			
 
				+ * Schedulers that are aware of the expected task length provided by the
			
 
				+ * perfmodels must make sure that :
			
 
				+ * 	- cpu_task is scheduled on a CPU.
			
 
				+ * 	- gpu_task is scheduled on a GPU.
			
 
				+ *
			
 
				+ * Applies to : heft, XXX : and to what other schedulers ?
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+static void
			
 
				+dummy(void *buffers[], void *args)
			
 
				+{
			
 
				+	(void) buffers;
			
 
				+	(void) args;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Fake cost functions.
			
 
				+ */
			
 
				+static double
			
 
				+cpu_task_cpu(struct starpu_task *task,
			
 
				+	     enum starpu_perf_archtype arch,
			
 
				+	     unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) arch;
			
 
				+	(void) nimpl;
			
 
				+	return 1.0;
			
 
				+}
			
 
				+
			
 
				+static double
			
 
				+cpu_task_gpu(struct starpu_task *task,
			
 
				+	     enum starpu_perf_archtype arch,
			
 
				+	     unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) arch;
			
 
				+	(void) nimpl;
			
 
				+
			
 
				+	return 1000.0;
			
 
				+}
			
 
				+
			
 
				+static double
			
 
				+gpu_task_cpu(struct starpu_task *task,
			
 
				+	     enum starpu_perf_archtype arch,
			
 
				+	     unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) arch;
			
 
				+	(void) nimpl;
			
 
				+
			
 
				+	return 1000.0;
			
 
				+}
			
 
				+
			
 
				+static double
			
 
				+gpu_task_gpu(struct starpu_task *task,
			
 
				+	     enum starpu_perf_archtype arch,
			
 
				+	     unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) arch;
			
 
				+	(void) nimpl;
			
 
				+
			
 
				+	return 1.0;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel model_cpu_task = 
			
 
				+{
			
 
				+	.type = STARPU_PER_ARCH
			
 
				+};
			
 
				+static struct starpu_perfmodel model_gpu_task = 
			
 
				+{
			
 
				+	.type = STARPU_PER_ARCH
			
 
				+};
			
 
				+
			
 
				+static void
			
 
				+init_perfmodels(void)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = STARPU_CPU_DEFAULT; i < STARPU_CUDA_DEFAULT; i++)
			
 
				+	{
			
 
				+		model_cpu_task.per_arch[i][0].cost_function = cpu_task_cpu;
			
 
				+		model_gpu_task.per_arch[i][0].cost_function = gpu_task_cpu;
			
 
				+	}
			
 
				+	for (i = STARPU_CUDA_DEFAULT; i < STARPU_GORDON_DEFAULT; i++)
			
 
				+	{
			
 
				+		model_cpu_task.per_arch[i][0].cost_function = cpu_task_gpu;
			
 
				+		model_gpu_task.per_arch[i][0].cost_function = gpu_task_gpu;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Dummy codelets.
			
 
				+ */
			
 
				+static struct starpu_codelet cpu_cl =
			
 
				+{
			
 
				+	.cpu_funcs    = { dummy, NULL },
			
 
				+	.cuda_funcs   = { dummy, NULL },
			
 
				+	.opencl_funcs = { dummy, NULL },
			
 
				+	.nbuffers     = 0,
			
 
				+	.model        = &model_cpu_task
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet gpu_cl =
			
 
				+{
			
 
				+	.cpu_funcs    = { dummy, NULL },
			
 
				+	.cuda_funcs   = { dummy, NULL },
			
 
				+	.opencl_funcs = { dummy, NULL },
			
 
				+	.nbuffers     = 0,
			
 
				+	.model        = &model_gpu_task
			
 
				+};
			
 
				+
			
 
				+static int
			
 
				+run(struct starpu_sched_policy *policy)
			
 
				+{
			
 
				+	struct starpu_conf conf;
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.sched_policy = policy;
			
 
				+	int ret = starpu_init(&conf);
			
 
				+	if (ret == -ENODEV)
			
 
				+		exit(STARPU_TEST_SKIPPED);
			
 
				+
			
 
				+	/* At least 1 CPU and 1 GPU are needed. */
			
 
				+	if (starpu_cpu_worker_get_count() == 0)
			
 
				+		exit(STARPU_TEST_SKIPPED);
			
 
				+	if (starpu_cuda_worker_get_count() == 0 &&
			
 
				+	    starpu_opencl_worker_get_count() == 0)
			
 
				+		exit(STARPU_TEST_SKIPPED);
			
 
				+
			
 
				+	starpu_profiling_status_set(1);
			
 
				+	init_perfmodels();
			
 
				+
			
 
				+	struct starpu_task *cpu_task = starpu_task_create();
			
 
				+	cpu_task->cl = &cpu_cl;
			
 
				+	cpu_task->destroy = 0;
			
 
				+
			
 
				+	struct starpu_task *gpu_task = starpu_task_create();
			
 
				+	gpu_task->cl = &gpu_cl;
			
 
				+	gpu_task->destroy = 0;
			
 
				+
			
 
				+	ret = starpu_task_submit(cpu_task);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	ret = starpu_task_submit(gpu_task);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	enum starpu_archtype cpu_task_worker, gpu_task_worker;
			
 
				+	cpu_task_worker = starpu_worker_get_type(cpu_task->profiling_info->workerid);
			
 
				+	gpu_task_worker = starpu_worker_get_type(gpu_task->profiling_info->workerid);
			
 
				+	if (cpu_task_worker != STARPU_CPU_WORKER ||
			
 
				+	    (gpu_task_worker != STARPU_CUDA_WORKER &&
			
 
				+	     gpu_task_worker != STARPU_OPENCL_WORKER))
			
 
				+		ret = 1;
			
 
				+	else
			
 
				+		ret = 0;
			
 
				+
			
 
				+
			
 
				+	starpu_task_destroy(cpu_task);
			
 
				+	starpu_task_destroy(gpu_task);
			
 
				+	starpu_shutdown();
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+extern struct starpu_sched_policy _starpu_sched_ws_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_prio_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_random_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dm_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_eager_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
			
 
				+*/
			
 
				+extern struct starpu_sched_policy _starpu_sched_heft_policy;
			
 
				+
			
 
				+/* XXX: what policies are we interested in ? */
			
 
				+static struct starpu_sched_policy *policies[] =
			
 
				+{
			
 
				+	//&_starpu_sched_ws_policy,
			
 
				+	//&_starpu_sched_prio_policy,
			
 
				+	//&_starpu_sched_dm_policy,
			
 
				+	//&_starpu_sched_dmda_policy,
			
 
				+	&_starpu_sched_heft_policy,
			
 
				+	//&_starpu_sched_dmda_ready_policy,
			
 
				+	//&_starpu_sched_dmda_sorted_policy,
			
 
				+	//&_starpu_sched_random_policy,
			
 
				+	//&_starpu_sched_eager_policy,
			
 
				+	//&_starpu_sched_parallel_heft_policy,
			
 
				+	//&_starpu_sched_pgreedy_policy
			
 
				+};
			
 
				+
			
 
				+int
			
 
				+main(void)
			
 
				+{
			
 
				+#ifndef STARPU_HAVE_SETENV
			
 
				+/* XXX: is this macro used by all the schedulers we are interested in ? */
			
 
				+#warning "setenv() is not available, skipping this test"
			
 
				+	return STARPU_TEST_SKIPPED;
			
 
				+#else
			
 
				+	setenv("STARPU_SCHED_BETA", "0", 1);
			
 
				+
			
 
				+	int i;
			
 
				+	int n_policies = sizeof(policies)/sizeof(policies[0]);
			
 
				+	for (i = 0; i < n_policies; ++i)
			
 
				+	{
			
 
				+		struct starpu_sched_policy *policy = policies[i];
			
 
				+		FPRINTF(stdout, "Running with policy %s.\n",
			
 
				+			policy->policy_name);
			
 
				+		int ret;
			
 
				+		ret = run(policy);
			
 
				+		if (ret == 1)
			
 
				+			return EXIT_FAILURE;
			
 
				+	}
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+#endif
			
 
				+}
			
--- a/tests/sched_policies/simple_deps.c
+++ b/tests/sched_policies/simple_deps.c
@@ -0,0 +1,136 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012 Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <unistd.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_profiling.h>
			
 
				+
			
 
				+
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+/*
			
 
				+ * Task1 must be executed before task0, even if task0 is submitted first.
			
 
				+ * Applies to : all schedulers.
			
 
				+ */
			
 
				+
			
 
				+static void
			
 
				+dummy(void *buffers[], void *args)
			
 
				+{
			
 
				+	(void) buffers;
			
 
				+	(void) args;
			
 
				+	usleep(1000000);
			
 
				+}
			
 
				+
			
 
				+static int
			
 
				+run(struct starpu_sched_policy *policy)
			
 
				+{
			
 
				+	int ret;
			
 
				+	struct starpu_conf conf;
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.sched_policy = policy;
			
 
				+	ret = starpu_init(&conf);
			
 
				+	if (ret != 0)
			
 
				+		exit(STARPU_TEST_SKIPPED);
			
 
				+	starpu_profiling_status_set(1);
			
 
				+
			
 
				+	struct starpu_codelet cl =
			
 
				+	{
			
 
				+		.cpu_funcs = {dummy, NULL},
			
 
				+		.nbuffers = 0
			
 
				+	};
			
 
				+
			
 
				+	struct starpu_task *task0 = starpu_task_create();
			
 
				+	task0->cl = &cl;
			
 
				+	task0->destroy = 0;
			
 
				+
			
 
				+	struct starpu_task *task1 = starpu_task_create();
			
 
				+	task1->cl = &cl;
			
 
				+	task1->destroy = 0;
			
 
				+
			
 
				+	starpu_task_declare_deps_array(task0, 1, &task1);
			
 
				+
			
 
				+	ret = starpu_task_submit(task0);
			
 
				+	if (ret == -ENODEV) goto enodev;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	ret = starpu_task_submit(task1);
			
 
				+	if (ret == -ENODEV) goto enodev;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+
			
 
				+	double t1, t2;
			
 
				+	t1 = starpu_timing_timespec_to_us(&task1->profiling_info->end_time);
			
 
				+	t2 = starpu_timing_timespec_to_us(&task0->profiling_info->start_time);
			
 
				+
			
 
				+	starpu_task_destroy(task0);
			
 
				+	starpu_task_destroy(task1);
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	return t1 < t2 ? 0:1;
			
 
				+
			
 
				+enodev:
			
 
				+	starpu_shutdown();
			
 
				+	return -ENODEV;
			
 
				+}
			
 
				+
			
 
				+extern struct starpu_sched_policy _starpu_sched_ws_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_prio_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_random_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dm_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_ready_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_dmda_sorted_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_eager_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_parallel_heft_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_pgreedy_policy;
			
 
				+extern struct starpu_sched_policy _starpu_sched_heft_policy;
			
 
				+
			
 
				+static struct starpu_sched_policy *policies[] =
			
 
				+{
			
 
				+	&_starpu_sched_ws_policy,
			
 
				+	&_starpu_sched_prio_policy,
			
 
				+	&_starpu_sched_dm_policy,
			
 
				+	&_starpu_sched_dmda_policy,
			
 
				+	&_starpu_sched_heft_policy,
			
 
				+	&_starpu_sched_dmda_ready_policy,
			
 
				+	&_starpu_sched_dmda_sorted_policy,
			
 
				+	&_starpu_sched_random_policy,
			
 
				+	&_starpu_sched_eager_policy,
			
 
				+	&_starpu_sched_parallel_heft_policy,
			
 
				+	&_starpu_sched_pgreedy_policy
			
 
				+};
			
 
				+
			
 
				+int
			
 
				+main(void)
			
 
				+{
			
 
				+	int i;
			
 
				+	int n_policies = sizeof(policies)/sizeof(policies[0]);
			
 
				+	for (i = 0; i < n_policies; ++i)
			
 
				+	{
			
 
				+		struct starpu_sched_policy *policy = policies[i];
			
 
				+		FPRINTF(stdout, "Running with policy %s.\n",
			
 
				+			policy->policy_name);
			
 
				+		int ret;
			
 
				+		ret = run(policy);
			
 
				+		if (ret == -ENODEV)
			
 
				+			return STARPU_TEST_SKIPPED;
			
 
				+		if (ret == 1)
			
 
				+			return EXIT_FAILURE;
			
 
				+	}
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
--- a/tools/dev/internal/rename_internal.sed
+++ b/tools/dev/internal/rename_internal.sed
@@ -14,6 +14,7 @@
 
				 #
			
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				+s/\bheft_policy\b/_starpu_sched_heft_policy/g
			
 
				 s/\bstruct starpu_priority_taskq_s\b/struct _starpu_priority_taskq/g
			
 
				 s/\bSTARPU_FUT_APPS_KEY\b/_STARPU_FUT_APPS_KEY/g
			
 
				 s/\bSTARPU_FUT_CPU_KEY\b/_STARPU_FUT_CPU_KEY/g