13 years ago · 482baecdf7
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,13 +65,15 @@ if BUILD_STARPU_TOP
 
																 all-local:
															
 
																 	cd starpu-top ; $(QMAKE) ; $(MAKE)
															
 
																 clean-local:
															
 
																-	cd starpu-top ; $(MAKE) clean
															
 
																+	cd starpu-top && $(QMAKE) && $(MAKE) clean && $(RM) Makefile
															
 
																 # TODO: resources
															
 
																 install-exec-local:
															
 
																 	$(MKDIR_P) $(DESTDIR)$(bindir)
															
 
																 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
															
 
																 uninstall-local:
															
 
																 	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
															
 
																+	$(RM) starpu-top/StarPU-Top
															
 
																+	$(RM) starpu-top/Makefile
															
 
																 endif
															
 
																 if STARPU_HAVE_WINDOWS
															
--- a/configure.ac
+++ b/configure.ac
@@ -594,9 +594,11 @@ if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
 
																     	STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
															
 
																         if test "$have_valid_opencl" = "no" ; then
															
 
																             for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH" ; do
															
 
																-    	        STARPU_CHECK_OPENCL($f, "no", "no")
															
 
																-                if test "$have_valid_opencl" = "yes" ; then
															
 
																-                    break
															
 
																+                if test -n "$f" ; then
															
 
																+    	            STARPU_CHECK_OPENCL($f, "no", "no")
															
 
																+                    if test "$have_valid_opencl" = "yes" ; then
															
 
																+                        break
															
 
																+                    fi
															
 
																                 fi
															
 
																             done
															
 
																         fi
															
@@ -1389,24 +1391,11 @@ AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 
																 # File configuration
															
 
																 AC_CONFIG_COMMANDS([executable-scripts], [
															
 
																   chmod +x tests/regression/regression.sh
															
 
																+  chmod +x gcc-plugin/tests/run-test
															
 
																 ])
															
 
																 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
															
 
																-AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
															
 
																-
															
 
																-if test $build_gcc_plugin == "yes" ; then
															
 
																-    AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
															
 
																-    AC_OUTPUT([
															
 
																-	    gcc-plugin/Makefile
															
 
																-	    gcc-plugin/src/Makefile
															
 
																-	    gcc-plugin/tests/Makefile
															
 
																-	    gcc-plugin/tests/run-test
															
 
																-	    gcc-plugin/examples/Makefile
															
 
																-            ])
															
 
																-    AC_CONFIG_COMMANDS([executable-plugin-scripts], [
															
 
																-            chmod +x gcc-plugin/tests/run-test
															
 
																-            ])
															
 
																-fi 
															
 
																+AC_CONFIG_HEADERS([src/common/config.h include/starpu_config.h gcc-plugin/src/starpu-gcc-config.h])
															
 
																 AC_OUTPUT([
															
 
																 	Makefile
															
@@ -1425,6 +1414,11 @@ AC_OUTPUT([
 
																 	doc/Makefile
															
 
																 	mpi/Makefile
															
 
																 	starpu-top/StarPU-Top.pro
															
 
																+        gcc-plugin/Makefile
															
 
																+	gcc-plugin/src/Makefile
															
 
																+	gcc-plugin/tests/Makefile
															
 
																+	gcc-plugin/tests/run-test
															
 
																+	gcc-plugin/examples/Makefile
															
 
																 ])
															
 
																 AC_MSG_NOTICE([
															
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -409,7 +409,7 @@ configuration:
 
																 * Setting flags for compiling and linking applications::  
															
 
																 * Running a basic StarPU application::  
															
 
																 * Kernel threads started by StarPU::
															
 
																-* Using accelerators::          
															
 
																+* Enabling OpenCL::
															
 
																 @end menu
															
 
																 @node Setting flags for compiling and linking applications
															
@@ -462,14 +462,20 @@ installed. This step is done only once per user and per machine.
 
																 @node Kernel threads started by StarPU
															
 
																 @section Kernel threads started by StarPU
															
 
																-TODO: StarPU starts one thread per CPU core and binds them there, uses one of
															
 
																-them per GPU. The application is not supposed to do computations in its own
															
 
																-threads. TODO: add a StarPU function to bind an application thread (e.g. the
															
 
																-main thread) to a dedicated core (and thus disable the corresponding StarPU CPU
															
 
																-worker).
															
 
																+StarPU automatically binds one thread per CPU core. It does not use
															
 
																+SMT/hyperthreading because kernels are usually already optimized for using a
															
 
																+full core, and using hyperthreading would make kernel calibration rather random.
															
 
																-@node Using accelerators
															
 
																-@section Using accelerators
															
 
																+Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU.
															
 
																+
															
 
																+While StarPU tasks are executing, the application is not supposed to do
															
 
																+computations in the threads it starts itself; tasks should be used instead.
															
 
																+
															
 
																+TODO: add a StarPU function to bind an application thread (e.g. the main thread)
															
 
																+to a dedicated core (and thus disable the corresponding StarPU CPU worker).
															
 
																+
															
 
																+@node Enabling OpenCL
															
 
																+@section Enabling OpenCL
															
 
																 When both CUDA and OpenCL drivers are enabled, StarPU will launch an
															
 
																 OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
															
@@ -477,8 +483,26 @@ This design choice was necessary as OpenCL and CUDA can not run at the
 
																 same time on the same NVIDIA GPU, as there is currently no interoperability
															
 
																 between them.
															
 
																-Details on how to specify devices running OpenCL and the ones running
															
 
																-CUDA are given in @ref{Enabling OpenCL}.
															
 
																+To enable OpenCL, you need either to disable CUDA when configuring StarPU:
															
 
																+
															
 
																+@example
															
 
																+% ./configure --disable-cuda
															
 
																+@end example
															
 
																+
															
 
																+or when running applications:
															
 
																+
															
 
																+@example
															
 
																+% STARPU_NCUDA=0 ./application
															
 
																+@end example
															
 
																+
															
 
																+OpenCL will automatically be started on any device not yet used by
															
 
																+CUDA. So on a machine running 4 GPUs, it is therefore possible to
															
 
																+enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
															
 
																+so:
															
 
																+
															
 
																+@example
															
 
																+% STARPU_NCUDA=2 ./application
															
 
																+@end example
															
 
																 @c ---------------------------------------------------------------------
															
@@ -1110,7 +1134,7 @@ or for example, by disabling CPU devices:
 
																 @end smallexample
															
 
																 or by disabling CUDA devices (which may permit to enable the use of OpenCL,
															
 
																-see @ref{Using accelerators}):
															
 
																+see @ref{Enabling OpenCL}):
															
 
																 @smallexample
															
 
																 % STARPU_NCUDA=0 ./vector_scal
															
@@ -2153,7 +2177,9 @@ very natural way by the means of asynchronous interactions between the
 
																 application and StarPU.  This is implemented in a separate libstarpumpi library
															
 
																 which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
															
 
																 @code{void *} buffers are replaced with @code{starpu_data_handle}s, and all
															
 
																-GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.
															
 
																+GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.  The user has to
															
 
																+use the usual @code{mpirun} command of the MPI implementation to start StarPU on
															
 
																+the different MPI nodes.
															
 
																 @menu
															
 
																 * The API::                     
															
@@ -3909,11 +3935,13 @@ This partitions a vector into blocks of the same size.
 
																 @deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
															
 
																-This partitions a vector into blocks of sizes given in @var{filter_arg_ptr}.
															
 
																+This partitions a vector into blocks of sizes given in the @var{filter_arg_ptr}
															
 
																+field of @var{f}, which is expected to point to a @code{uint32_t*} array.
															
 
																 @end deftypefun
															
 
																 @deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
															
 
																-This partitions a vector into two blocks, the first block size being given in @var{filter_arg}.
															
 
																+This partitions a vector into two blocks, the first block size being given in
															
 
																+the @var{filter_arg} field of @var{f}.
															
 
																 @end deftypefun
															
@@ -4601,38 +4629,11 @@ This function synchronously deinitializes the CUBLAS library on every CUDA devic
 
																 @section OpenCL extensions
															
 
																 @menu
															
 
																-* Enabling OpenCL::            Enabling OpenCL
															
 
																 * Compiling OpenCL kernels::   Compiling OpenCL kernels
															
 
																 * Loading OpenCL kernels::     Loading OpenCL kernels
															
 
																 * OpenCL statistics::          Collecting statistics from OpenCL
															
 
																 @end menu
															
 
																-@node Enabling OpenCL
															
 
																-@subsection Enabling OpenCL
															
 
																-
															
 
																-On GPU devices which can run both CUDA and OpenCL, CUDA will be
															
 
																-enabled by default. To enable OpenCL, you need either to disable CUDA
															
 
																-when configuring StarPU:
															
 
																-
															
 
																-@example
															
 
																-% ./configure --disable-cuda
															
 
																-@end example
															
 
																-
															
 
																-or when running applications:
															
 
																-
															
 
																-@example
															
 
																-% STARPU_NCUDA=0 ./application
															
 
																-@end example
															
 
																-
															
 
																-OpenCL will automatically be started on any device not yet used by
															
 
																-CUDA. So on a machine running 4 GPUS, it is therefore possible to
															
 
																-enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
															
 
																-so:
															
 
																-
															
 
																-@example
															
 
																-% STARPU_NCUDA=2 ./application
															
 
																-@end example
															
 
																-
															
 
																 @node Compiling OpenCL kernels
															
 
																 @subsection Compiling OpenCL kernels
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -112,6 +112,9 @@ noinst_HEADERS = 				\
 
																 	lu/xlu_kernels.h			\
															
 
																 	lu/float.h				\
															
 
																 	lu/double.h				\
															
 
																+	lu/complex_float.h			\
															
 
																+	lu/complex_double.h			\
															
 
																+	lu/blas_complex.h			\
															
 
																 	cholesky/cholesky.h			\
															
 
																 	common/blas_model.h			\
															
 
																 	common/blas.h				\
															
@@ -195,6 +198,14 @@ examplebin_PROGRAMS +=				\
 
																 	cg/cg
															
 
																 endif
															
 
																+if MKL_BLAS_LIB
															
 
																+examplebin_PROGRAMS +=				\
															
 
																+	lu/lu_example_complex_float		\
															
 
																+	lu/lu_example_complex_double		\
															
 
																+	lu/lu_implicit_example_complex_float	\
															
 
																+	lu/lu_implicit_example_complex_double
															
 
																+endif
															
 
																+
															
 
																 if ATLAS_BLAS_LIB
															
 
																 examplebin_PROGRAMS +=				\
															
 
																 	spmv/dw_block_spmv
															
@@ -244,6 +255,14 @@ STARPU_EXAMPLES +=				\
 
																 	cg/cg
															
 
																 endif
															
 
																+if MKL_BLAS_LIB
															
 
																+STARPU_EXAMPLES +=				\
															
 
																+	lu/lu_example_complex_float		\
															
 
																+	lu/lu_example_complex_double		\
															
 
																+	lu/lu_implicit_example_complex_float	\
															
 
																+	lu/lu_implicit_example_complex_double
															
 
																+endif
															
 
																+
															
 
																 if ATLAS_BLAS_LIB
															
 
																 STARPU_EXAMPLES +=				\
															
 
																 	spmv/dw_block_spmv
															
@@ -466,8 +485,56 @@ lu_lu_implicit_example_double_SOURCES =		\
 
																 lu_lu_implicit_example_double_LDADD =		\
															
 
																 	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																+if MKL_BLAS_LIB
															
 
																+lu_lu_example_complex_float_SOURCES =		\
															
 
																+	lu/lu_example_complex_float.c		\
															
 
																+	lu/clu.c				\
															
 
																+	lu/clu_pivot.c				\
															
 
																+	lu/clu_kernels.c			\
															
 
																+	lu/blas_complex.c			\
															
 
																+	common/blas.c
															
 
																+
															
 
																+lu_lu_example_complex_float_LDADD =		\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																+lu_lu_implicit_example_complex_float_SOURCES =	\
															
 
																+	lu/lu_example_complex_float.c		\
															
 
																+	lu/clu_implicit.c			\
															
 
																+	lu/clu_implicit_pivot.c			\
															
 
																+	lu/clu_kernels.c			\
															
 
																+	lu/blas_complex.c			\
															
 
																+	common/blas.c
															
 
																+
															
 
																+lu_lu_implicit_example_complex_float_LDADD =	\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																+lu_lu_example_complex_double_SOURCES =		\
															
 
																+	lu/lu_example_complex_double.c		\
															
 
																+	lu/zlu.c				\
															
 
																+	lu/zlu_pivot.c				\
															
 
																+	lu/zlu_kernels.c			\
															
 
																+	lu/blas_complex.c			\
															
 
																+	common/blas.c
															
 
																+
															
 
																+lu_lu_example_complex_double_LDADD =		\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																+lu_lu_implicit_example_complex_double_SOURCES =	\
															
 
																+	lu/lu_example_complex_double.c		\
															
 
																+	lu/zlu_implicit.c			\
															
 
																+	lu/zlu_implicit_pivot.c			\
															
 
																+	lu/zlu_kernels.c			\
															
 
																+	lu/blas_complex.c			\
															
 
																+	common/blas.c
															
 
																+
															
 
																+lu_lu_implicit_example_complex_double_LDADD =	\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																+endif
															
 
																 endif
															
 
																+
															
 
																 ###########################
															
 
																 # 2 Cholesky in 2 ctxs  example #
															
 
																 ###########################
															
--- a/examples/basic_examples/block.c
+++ b/examples/basic_examples/block.c
@@ -122,5 +122,5 @@ int main(int argc, char **argv)
 
																         starpu_shutdown();
															
 
																-	return 0;
															
 
																+	return (ret!=1);
															
 
																 }
															
--- a/examples/lu/blas_complex.c
+++ b/examples/lu/blas_complex.c
@@ -0,0 +1,214 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <ctype.h>
															
 
																+#include <stdio.h>
															
 
																+#include <complex.h>
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include "blas_complex.h"
															
 
																+
															
 
																+/*
															
 
																+    This file contains BLAS wrappers for the different BLAS implementations
															
 
																+  (eg. REFBLAS, STARPU_ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
															
 
																+  libraries do not supply C-based ordering.
															
 
																+ */
															
 
																+
															
 
																+#ifdef STARPU_ATLAS
															
 
																+#error not implemented
															
 
																+#elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
															
 
																+#error not implemented
															
 
																+#elif defined(STARPU_MKL)
															
 
																+
															
 
																+inline void CGEMM(char *transa, char *transb, int M, int N, int K, 
															
 
																+			complex float alpha, complex float *A, int lda, complex float *B, int ldb, 
															
 
																+			complex float beta, complex float *C, int ldc)
															
 
																+{
															
 
																+	cgemm_(transa, transb, &M, &N, &K, &alpha,
															
 
																+			 A, &lda, B, &ldb,
															
 
																+			 &beta, C, &ldc);	
															
 
																+}
															
 
																+
															
 
																+inline void ZGEMM(char *transa, char *transb, int M, int N, int K, 
															
 
																+			complex double alpha, complex double *A, int lda, complex double *B, int ldb, 
															
 
																+			complex double beta, complex double *C, int ldc)
															
 
																+{
															
 
																+	zgemm_(transa, transb, &M, &N, &K, &alpha,
															
 
																+			 A, &lda, B, &ldb,
															
 
																+			 &beta, C, &ldc);	
															
 
																+}
															
 
																+
															
 
																+
															
 
																+inline void CGEMV(char *transa, int M, int N, complex float alpha, complex float *A, int lda,
															
 
																+		complex float *X, int incX, complex float beta, complex float *Y, int incY)
															
 
																+{
															
 
																+	cgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
															
 
																+}
															
 
																+
															
 
																+inline void ZGEMV(char *transa, int M, int N, complex double alpha, complex double *A, int lda,
															
 
																+		complex double *X, int incX, complex double beta, complex double *Y, int incY)
															
 
																+{
															
 
																+	zgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
															
 
																+}
															
 
																+
															
 
																+inline float SCASUM(int N, complex float *X, int incX)
															
 
																+{
															
 
																+	return scasum_(&N, X, &incX);
															
 
																+}
															
 
																+
															
 
																+inline double DZASUM(int N, complex double *X, int incX)
															
 
																+{
															
 
																+	return dzasum_(&N, X, &incX);
															
 
																+}
															
 
																+
															
 
																+void CSCAL(int N, complex float alpha, complex float *X, int incX)
															
 
																+{
															
 
																+	cscal_(&N, &alpha, X, &incX);
															
 
																+}
															
 
																+
															
 
																+void ZSCAL(int N, complex double alpha, complex double *X, int incX)
															
 
																+{
															
 
																+	zscal_(&N, &alpha, X, &incX);
															
 
																+}
															
 
																+
															
 
																+void CTRSM (const char *side, const char *uplo, const char *transa,
															
 
																+                   const char *diag, const int m, const int n,
															
 
																+                   const complex float alpha, const complex float *A, const int lda,
															
 
																+                   complex float *B, const int ldb)
															
 
																+{
															
 
																+	ctrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
															
 
																+}
															
 
																+
															
 
																+void ZTRSM (const char *side, const char *uplo, const char *transa,
															
 
																+                   const char *diag, const int m, const int n,
															
 
																+                   const complex double alpha, const complex double *A, const int lda,
															
 
																+                   complex double *B, const int ldb)
															
 
																+{
															
 
																+	ztrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
															
 
																+}
															
 
																+
															
 
																+void CSYR (const char *uplo, const int n, const complex float alpha,
															
 
																+                  const complex float *x, const int incx, complex float *A, const int lda)
															
 
																+{
															
 
																+	csyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
															
 
																+}
															
 
																+
															
 
																+void CSYRK (const char *uplo, const char *trans, const int n,
															
 
																+                   const int k, const complex float alpha, const complex float *A,
															
 
																+                   const int lda, const complex float beta, complex float *C,
															
 
																+                   const int ldc)
															
 
																+{
															
 
																+	csyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
															
 
																+}
															
 
																+
															
 
																+void CGERU(const int m, const int n, const complex float alpha,
															
 
																+                  const complex float *x, const int incx, const complex float *y,
															
 
																+                  const int incy, complex float *A, const int lda)
															
 
																+{
															
 
																+	cgeru_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
															
 
																+}
															
 
																+
															
 
																+void ZGERU(const int m, const int n, const complex double alpha,
															
 
																+                  const complex double *x, const int incx, const complex double *y,
															
 
																+                  const int incy, complex double *A, const int lda)
															
 
																+{
															
 
																+	zgeru_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
															
 
																+}
															
 
																+
															
 
																+void CTRSV (const char *uplo, const char *trans, const char *diag, 
															
 
																+                   const int n, const complex float *A, const int lda, complex float *x, 
															
 
																+                   const int incx)
															
 
																+{
															
 
																+	ctrsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
															
 
																+}
															
 
																+
															
 
																+void CTRMM(const char *side, const char *uplo, const char *transA,
															
 
																+                 const char *diag, const int m, const int n,
															
 
																+                 const complex float alpha, const complex float *A, const int lda,
															
 
																+                 complex float *B, const int ldb)
															
 
																+{
															
 
																+	ctrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
															
 
																+}
															
 
																+
															
 
																+void ZTRMM(const char *side, const char *uplo, const char *transA,
															
 
																+                 const char *diag, const int m, const int n,
															
 
																+                 const complex double alpha, const complex double *A, const int lda,
															
 
																+                 complex double *B, const int ldb)
															
 
																+{
															
 
																+	ztrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
															
 
																+}
															
 
																+
															
 
																+void CTRMV(const char *uplo, const char *transA, const char *diag,
															
 
																+                 const int n, const complex float *A, const int lda, complex float *X,
															
 
																+                 const int incX)
															
 
																+{
															
 
																+	ctrmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
															
 
																+}
															
 
																+
															
 
																+void CAXPY(const int n, const complex float alpha, complex float *X, const int incX, complex float *Y, const int incY)
															
 
																+{
															
 
																+	caxpy_(&n, &alpha, X, &incX, Y, &incY);
															
 
																+}
															
 
																+
															
 
																+void ZAXPY(const int n, const complex double alpha, complex double *X, const int incX, complex double *Y, const int incY)
															
 
																+{
															
 
																+	zaxpy_(&n, &alpha, X, &incX, Y, &incY);
															
 
																+}
															
 
																+
															
 
																+int ICAMAX (const int n, complex float *X, const int incX)
															
 
																+{
															
 
																+    int retVal;
															
 
																+    retVal = icamax_ (&n, X, &incX);
															
 
																+    return retVal;
															
 
																+}
															
 
																+
															
 
																+int IZAMAX (const int n, complex double *X, const int incX)
															
 
																+{
															
 
																+    int retVal;
															
 
																+    retVal = izamax_ (&n, X, &incX);
															
 
																+    return retVal;
															
 
																+}
															
 
																+
															
 
																+complex float CDOTU(const int n, const complex float *x, const int incx, const complex float *y, const int incy)
															
 
																+{
															
 
																+	/* Forward to cdotu_, passing the scalar arguments by address. */
															
 
																+	/* NOTE(review): assumes cdotu_ is declared to return a complex value; confirm in blas_complex.h. */
															
 
																+	/* No (float) cast here: narrowing to a real type would discard the imaginary part. */
															
 
																+	complex float retVal = cdotu_(&n, x, &incx, y, &incy);
															
 
																+
															
 
																+	return retVal;
															
 
																+}
															
 
																+
															
 
																+complex double ZDOTU(const int n, const complex double *x, const int incx, const complex double *y, const int incy)
															
 
																+{
															
 
																+	return zdotu_(&n, x, &incx, y, &incy);
															
 
																+}
															
 
																+
															
 
																+void CSWAP(const int n, complex float *X, const int incX, complex float *Y, const int incY)
															
 
																+{
															
 
																+	cswap_(&n, X, &incX, Y, &incY);
															
 
																+}
															
 
																+
															
 
																+void ZSWAP(const int n, complex double *X, const int incX, complex double *Y, const int incY)
															
 
																+{
															
 
																+	zswap_(&n, X, &incX, Y, &incY);
															
 
																+}
															
 
																+
															
 
																+
															
 
																+#else
															
 
																+#error "no BLAS lib available..."
															
 
																+#endif
															
--- a/examples/lu/blas_complex.h
+++ b/examples/lu/blas_complex.h
@@ -0,0 +1,156 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#ifndef __BLAS_COMPLEX_H__
															
 
																+#define __BLAS_COMPLEX_H__
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#if defined(STARPU_MKL)
															
 
																+#define MKLcomplex8 complex float
															
 
																+#define MKLcomplex16 complex double
															
 
																+#endif
															
 
																+
															
 
																+void CGEMM(char *transa, char *transb, int M, int N, int K, complex float alpha, complex float *A, int lda, 
															
 
																+		complex float *B, int ldb, complex float beta, complex float *C, int ldc);
															
 
																+void ZGEMM(char *transa, char *transb, int M, int N, int K, complex double alpha, complex double *A, int lda, 
															
 
																+		complex double *B, int ldb, complex double beta, complex double *C, int ldc);
															
 
																+void CGEMV(char *transa, int M, int N, complex float alpha, complex float *A, int lda,
															
 
																+		complex float *X, int incX, complex float beta, complex float *Y, int incY);
															
 
																+void ZGEMV(char *transa, int M, int N, complex double alpha, complex double *A, int lda,
															
 
																+		complex double *X, int incX, complex double beta, complex double *Y, int incY);
															
 
																+float SCASUM(int N, complex float *X, int incX);
															
 
																+double DZASUM(int N, complex double *X, int incX);
															
 
																+void CSCAL(int N, complex float alpha, complex float *X, int incX);
															
 
																+void ZSCAL(int N, complex double alpha, complex double *X, int incX);
															
 
																+void CTRSM (const char *side, const char *uplo, const char *transa,
															
 
																+                   const char *diag, const int m, const int n,
															
 
																+                   const complex float alpha, const complex float *A, const int lda,
															
 
																+                   complex float *B, const int ldb);
															
 
																+void ZTRSM (const char *side, const char *uplo, const char *transa,
															
 
																+                   const char *diag, const int m, const int n,
															
 
																+                   const complex double alpha, const complex double *A, const int lda,
															
 
																+                   complex double *B, const int ldb);
															
 
																+void CSYR (const char *uplo, const int n, const complex float alpha,
															
 
																+                  const complex float *x, const int incx, complex float *A, const int lda);
															
 
																+void CSYRK (const char *uplo, const char *trans, const int n,
															
 
																+                   const int k, const complex float alpha, const complex float *A,
															
 
																+                   const int lda, const complex float beta, complex float *C,
															
 
																+                   const int ldc);
															
 
																+void CGERU (const int m, const int n, const complex float alpha,
															
 
																+                  const complex float *x, const int incx, const complex float *y,
															
 
																+                  const int incy, complex float *A, const int lda);
															
 
																+void ZGERU(const int m, const int n, const complex double alpha,
															
 
																+                  const complex double *x, const int incx, const complex double *y,
															
 
																+                  const int incy, complex double *A, const int lda);
															
 
																+void CTRSV (const char *uplo, const char *trans, const char *diag, 
															
 
																+                   const int n, const complex float *A, const int lda, complex float *x, 
															
 
																+                   const int incx);
															
 
																+void CTRMM(const char *side, const char *uplo, const char *transA,
															
 
																+                 const char *diag, const int m, const int n,
															
 
																+                 const complex float alpha, const complex float *A, const int lda,
															
 
																+                 complex float *B, const int ldb);
															
 
																+void ZTRMM(const char *side, const char *uplo, const char *transA,
															
 
																+                 const char *diag, const int m, const int n,
															
 
																+                 const complex double alpha, const complex double *A, const int lda,
															
 
																+                 complex double *B, const int ldb);
															
 
																+void CTRMV(const char *uplo, const char *transA, const char *diag,
															
 
																+                 const int n, const complex float *A, const int lda, complex float *X,
															
 
																+                 const int incX);
															
 
																+void CAXPY(const int n, const complex float alpha, complex float *X, const int incX, complex float *Y, const int incy);
															
 
																+void ZAXPY(const int n, const complex double alpha, complex double *X, const int incX, complex double *Y, const int incY);
															
 
																+int ICAMAX (const int n, complex float *X, const int incX);
															
 
																+int IZAMAX (const int n, complex double *X, const int incX);
															
 
																+complex float CDOTU(const int n, const complex float *x, const int incx, const complex float *y, const int incy);
															
 
																+complex double ZDOTU(const int n, const complex double *x, const int incx, const complex double *y, const int incy);
															
 
																+void CSWAP(const int n, complex float *x, const int incx, complex float *y, const int incy);
															
 
																+void ZSWAP(const int n, complex double *x, const int incx, complex double *y, const int incy);
															
 
																+
															
 
																+#if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
															
 
																+#error not implemented
															
 
																+#elif defined(STARPU_MKL)
															
 
																+
															
 
																+extern void cgemm_ (const char *transa, const char *transb, const int *m,
															
 
																+                   const int *n, const int *k, const complex float *alpha, 
															
 
																+                   const complex float *A, const int *lda, const complex float *B, 
															
 
																+                   const int *ldb, const complex float *beta, complex float *C, 
															
 
																+                   const int *ldc);
															
 
																+extern void zgemm_ (const char *transa, const char *transb, const int *m,
															
 
																+                   const int *n, const int *k, const complex double *alpha, 
															
 
																+                   const complex double *A, const int *lda, const complex double *B, 
															
 
																+                   const int *ldb, const complex double *beta, complex double *C, 
															
 
																+                   const int *ldc);
															
 
																+extern void cgemv_(const char *trans, int *m, int *n, complex float *alpha,
															
 
																+                   void *a, int *lda, void *x, int *incx, 
															
 
																+                   complex float *beta, void *y, int *incy);
															
 
																+extern void zgemv_(const char *trans, int *m, int *n, complex double *alpha,
															
 
																+                   void *a, int *lda, void *x, int *incx,
															
 
																+                   complex double *beta, void *y, int *incy);
															
 
																+extern void csyr_ (const char *uplo, const int *n, const complex float *alpha,
															
 
																+                  const complex float *x, const int *incx, complex float *A, const int *lda);
															
 
																+extern void csyrk_ (const char *uplo, const char *trans, const int *n,
															
 
																+                   const int *k, const complex float *alpha, const complex float *A,
															
 
																+                   const int *lda, const complex float *beta, complex float *C,
															
 
																+                   const int *ldc);
															
 
																+extern void ctrsm_ (const char *side, const char *uplo, const char *transa, 
															
 
																+                   const char *diag, const int *m, const int *n,
															
 
																+                   const complex float *alpha, const complex float *A, const int *lda,
															
 
																+                   complex float *B, const int *ldb);
															
 
																+extern void ztrsm_ (const char *side, const char *uplo, const char *transa, 
															
 
																+                   const char *diag, const int *m, const int *n,
															
 
																+                   const complex double *alpha, const complex double *A, const int *lda,
															
 
																+                   complex double *B, const int *ldb);
															
 
																+extern float scasum_ (const int *n, const complex float *x, const int *incx); /* ?asum returns a real scalar, not a complex one */
															
 
																+extern double dzasum_ (const int *n, const complex double *x, const int *incx);
															
 
																+extern void cscal_ (const int *n, const complex float *alpha, complex float *x,
															
 
																+                   const int *incx);
															
 
																+extern void zscal_ (const int *n, const complex double *alpha, complex double *x,
															
 
																+                   const int *incx);
															
 
																+extern void cgeru_(const int *m, const int *n, const complex float *alpha,
															
 
																+                  const complex float *x, const int *incx, const complex float *y,
															
 
																+                  const int *incy, complex float *A, const int *lda);
															
 
																+extern void zgeru_(const int *m, const int *n, const complex double *alpha,
															
 
																+                  const complex double *x, const int *incx, const complex double *y,
															
 
																+                  const int *incy, complex double *A, const int *lda);
															
 
																+extern void ctrsv_ (const char *uplo, const char *trans, const char *diag, 
															
 
																+                   const int *n, const complex float *A, const int *lda, complex float *x, 
															
 
																+                   const int *incx);
															
 
																+extern void ctrmm_(const char *side, const char *uplo, const char *transA,
															
 
																+                 const char *diag, const int *m, const int *n,
															
 
																+                 const complex float *alpha, const complex float *A, const int *lda,
															
 
																+                 complex float *B, const int *ldb);
															
 
																+extern void ztrmm_(const char *side, const char *uplo, const char *transA,
															
 
																+                 const char *diag, const int *m, const int *n,
															
 
																+                 const complex double *alpha, const complex double *A, const int *lda,
															
 
																+                 complex double *B, const int *ldb);
															
 
																+extern void ctrmv_(const char *uplo, const char *transA, const char *diag,
															
 
																+                 const int *n, const complex float *A, const int *lda, complex float *X,
															
 
																+                 const int *incX);
															
 
																+extern void caxpy_(const int *n, const complex float *alpha, complex float *X, const int *incX,
															
 
																+		complex float *Y, const int *incy);
															
 
																+extern void zaxpy_(const int *n, const complex double *alpha, complex double *X, const int *incX,
															
 
																+		complex double *Y, const int *incy);
															
 
																+extern int icamax_(const int *n, complex float *X, const int *incX);
															
 
																+extern int izamax_(const int *n, complex double *X, const int *incX);
															
 
																+/* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
															
 
																+extern complex double cdotu_(const int *n, const complex float *x, const int *incx, const complex float *y, const int *incy);
															
 
																+extern complex double zdotu_(const int *n, const complex double *x, const int *incx, const complex double *y, const int *incy);
															
 
																+extern void cswap_(const int *n, complex float *x, const int *incx, complex float *y, const int *incy);
															
 
																+extern void zswap_(const int *n, complex double *x, const int *incx, complex double *y, const int *incy);
															
 
																+
															
 
																+#endif
															
 
																+
															
 
																+#endif /* __BLAS_COMPLEX_H__ */
															
--- a/examples/lu/clu.c
+++ b/examples/lu/clu.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_float.h"
															
 
																+#include "xlu.c"
															
--- a/examples/lu/clu_implicit.c
+++ b/examples/lu/clu_implicit.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_float.h"
															
 
																+#include "xlu_implicit.c"
															
--- a/examples/lu/clu_implicit_pivot.c
+++ b/examples/lu/clu_implicit_pivot.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_float.h"
															
 
																+#include "xlu_implicit_pivot.c"
															
--- a/examples/lu/clu_kernels.c
+++ b/examples/lu/clu_kernels.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_float.h"
															
 
																+#include "xlu_kernels.c"
															
--- a/examples/lu/clu_pivot.c
+++ b/examples/lu/clu_pivot.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_float.h"
															
 
																+#include "xlu_pivot.c"
															
--- a/examples/lu/complex_double.h
+++ b/examples/lu/complex_double.h
@@ -0,0 +1,52 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+
															
 
																+#include <complex.h>
															
 
																+#include "blas_complex.h"
															
 
																+#define TYPE complex double
															
 
																+#define CUBLAS_TYPE cuDoubleComplex
															
 
																+
															
 
																+#define STARPU_LU(name)       starpu_zlu_##name
															
 
																+#define COMPLEX_LU
															
 
																+
															
 
																+#ifdef STARPU_HAVE_MAGMA
															
 
																+#include <magmablas.h>
															
 
																+#define CUBLAS_GEMM	magmablas_zgemm
															
 
																+#define CUBLAS_TRSM	magmablas_ztrsm
															
 
																+#else
															
 
																+#define CUBLAS_GEMM	cublasZgemm
															
 
																+#define CUBLAS_TRSM	cublasZtrsm
															
 
																+#endif
															
 
																+
															
 
																+#define CUBLAS_SCAL	cublasZscal
															
 
																+#define CUBLAS_GER	cublasZgeru
															
 
																+#define CUBLAS_SWAP	cublasZswap
															
 
																+#define CUBLAS_IAMAX	cublasIzamax
															
 
																+
															
 
																+#define CPU_GEMM	ZGEMM
															
 
																+#define CPU_TRSM	ZTRSM
															
 
																+#define CPU_SCAL	ZSCAL
															
 
																+#define CPU_GER		ZGERU
															
 
																+#define CPU_SWAP	ZSWAP
															
 
																+
															
 
																+#define CPU_TRMM	ZTRMM
															
 
																+#define CPU_AXPY	ZAXPY
															
 
																+#define CPU_ASUM	DZASUM
															
 
																+#define CPU_IAMAX	IZAMAX
															
 
																+
															
 
																+#define PIVOT_THRESHHOLD	10e-5
															
--- a/examples/lu/complex_float.h
+++ b/examples/lu/complex_float.h
@@ -0,0 +1,52 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+
															
 
																+#include <complex.h>
															
 
																+#include "blas_complex.h"
															
 
																+#define TYPE complex float
															
 
																+#define CUBLAS_TYPE cuComplex
															
 
																+
															
 
																+#define STARPU_LU(name)       starpu_clu_##name
															
 
																+#define COMPLEX_LU
															
 
																+
															
 
																+#ifdef STARPU_HAVE_MAGMA
															
 
																+#include <magmablas.h>
															
 
																+#define CUBLAS_GEMM	magmablas_cgemm
															
 
																+#define CUBLAS_TRSM	magmablas_ctrsm
															
 
																+#else
															
 
																+#define CUBLAS_GEMM	cublasCgemm
															
 
																+#define CUBLAS_TRSM	cublasCtrsm
															
 
																+#endif
															
 
																+
															
 
																+#define CUBLAS_SCAL	cublasCscal
															
 
																+#define CUBLAS_GER	cublasCgeru
															
 
																+#define CUBLAS_SWAP	cublasCswap
															
 
																+#define CUBLAS_IAMAX	cublasIcamax
															
 
																+
															
 
																+#define CPU_GEMM	CGEMM
															
 
																+#define CPU_TRSM	CTRSM
															
 
																+#define CPU_SCAL	CSCAL
															
 
																+#define CPU_GER		CGERU
															
 
																+#define CPU_SWAP	CSWAP
															
 
																+
															
 
																+#define CPU_TRMM	CTRMM
															
 
																+#define CPU_AXPY	CAXPY
															
 
																+#define CPU_ASUM	SCASUM
															
 
																+#define CPU_IAMAX	ICAMAX
															
 
																+
															
 
																+#define PIVOT_THRESHHOLD	10e-5
															
--- a/examples/lu/double.h
+++ b/examples/lu/double.h
@@ -16,6 +16,7 @@
 
																  */
															
 
																 #define TYPE double
															
 
																+#define CUBLAS_TYPE TYPE
															
 
																 #define STARPU_LU(name)       starpu_dlu_##name
															
--- a/examples/lu/float.h
+++ b/examples/lu/float.h
@@ -17,6 +17,7 @@
 
																 #define TYPE float
															
 
																+#define CUBLAS_TYPE TYPE
															
 
																 #define STARPU_LU(name)       starpu_slu_##name
															
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -164,6 +164,10 @@ static void init_matrix(void)
 
																 		for (i = 0; i < size; i++)
															
 
																 		{
															
 
																 			A[i + j*size] = (TYPE)starpu_drand48();
															
 
																+#ifdef COMPLEX_LU
															
 
																+			/* also randomize the imaginary component for complex number cases */
															
 
																+			A[i + j*size] += (TYPE)(I*starpu_drand48());
															
 
																+#endif
															
 
																 		}
															
 
																 	}
															
@@ -249,11 +253,20 @@ static void check_result(void)
 
																 	CPU_AXPY(size*size, -1.0, A_saved, 1, L, 1);
															
 
																 	display_matrix(L, size, size, "Residuals");
															
 
																+#ifdef COMPLEX_LU
															
 
																+	double err = CPU_ASUM(size*size, L, 1);
															
 
																+	int max = CPU_IAMAX(size*size, L, 1);
															
 
																+	TYPE l_max = L[max]; /* NOTE(review): Fortran i?amax_ results are 1-based; confirm CPU_IAMAX yields a 0-based index here */
															
 
																+	/* cabs() gives the modulus of the complex residual without overflowing on the intermediate squares */
															
 
																+	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
															
 
																+	FPRINTF(stderr, "Max error : %e\n", cabs(l_max));
															
 
																+#else
															
 
																 	TYPE err = CPU_ASUM(size*size, L, 1);
															
 
																 	int max = CPU_IAMAX(size*size, L, 1);
															
 
																 	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
															
 
																 	FPRINTF(stderr, "Max error : %e\n", L[max]);
															
 
																+#endif
															
 
																 	double residual = frobenius_norm(L, size);
															
 
																 	double matnorm = frobenius_norm(A_saved, size);
															
--- a/examples/lu/lu_example_complex_double.c
+++ b/examples/lu/lu_example_complex_double.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_double.h"
															
 
																+#include "lu_example.c"
															
--- a/examples/lu/lu_example_complex_float.c
+++ b/examples/lu/lu_example_complex_float.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_float.h"
															
 
																+#include "lu_example.c"
															
--- a/examples/lu/xlu_kernels.c
+++ b/examples/lu/xlu_kernels.c
@@ -22,6 +22,11 @@
 
																 #define xstr(s)        str(s)
															
 
																 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+static const TYPE p1 =  1.0f;
															
 
																+static const TYPE m1 = -1.0f;
															
 
																+#endif
															
 
																+
															
 
																 /*
															
 
																  *   U22 
															
 
																  */
															
@@ -54,10 +59,10 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 
																 			break;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-		case 1:
															
 
																+		case 1: {
															
 
																 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
															
 
																-				(TYPE)-1.0, right, ld21, left, ld12,
															
 
																-				(TYPE)1.0f, center, ld22);
															
 
																+				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
															
 
																+				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
															
 
																 			status = cublasGetError();
															
 
																 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
															
@@ -67,6 +72,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 
																 				STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 			break;
															
 
																+		}
															
 
																 #endif
															
 
																 		default:
															
 
																 			STARPU_ABORT();
															
@@ -140,7 +146,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
															
 
																-					(TYPE)1.0, sub11, ld11, sub12, ld12);
															
 
																+					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
															
 
																 			status = cublasGetError();
															
 
																 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
															
@@ -221,7 +227,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 
																 #ifdef STARPU_USE_CUDA
															
 
																 		case 1:
															
 
																 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
															
 
																-					(TYPE)1.0, sub11, ld11, sub21, ld21);
															
 
																+					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
															
 
																 			status = cublasGetError();
															
 
																 			if (status != CUBLAS_STATUS_SUCCESS)
															
@@ -307,17 +313,19 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
																 			for (z = 0; z < nx; z++)
															
 
																 			{
															
 
																 				TYPE pivot;
															
 
																+				TYPE inv_pivot;
															
 
																 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
															
 
																 				cudaStreamSynchronize(0);
															
 
																 				STARPU_ASSERT(pivot != 0.0);
															
 
																-				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
															
 
																+				inv_pivot = 1.0/pivot;
															
 
																+				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
															
 
																-				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
															
 
																-						&sub11[(z+1)+z*ld], 1,
															
 
																-						&sub11[z+(z+1)*ld], ld,
															
 
																-						&sub11[(z+1) + (z+1)*ld],ld);
															
 
																+				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
															
 
																+						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
															
 
																+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
															
 
																+						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
															
 
																 			}
															
 
																 			cudaThreadSynchronize();
															
@@ -423,20 +431,21 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
																 			for (z = 0; z < nx; z++)
															
 
																 			{
															
 
																 				TYPE pivot;
															
 
																+				TYPE inv_pivot;
															
 
																 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
															
 
																 				cudaStreamSynchronize(0);
															
 
																 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
															
 
																 				{
															
 
																 					/* find the pivot */
															
 
																-					int piv_ind = CUBLAS_IAMAX(nx - z, &sub11[z*(ld+1)], ld) - 1;
															
 
																+					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
															
 
																 					ipiv[z + first] = piv_ind + z + first;
															
 
																 					/* swap if needed */
															
 
																 					if (piv_ind != 0)
															
 
																 					{
															
 
																-						CUBLAS_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
															
 
																+						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
															
 
																 					}
															
 
																 					cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
															
@@ -445,12 +454,13 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
																 				STARPU_ASSERT(pivot != 0.0);
															
 
																-				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
															
 
																+				inv_pivot = 1.0/pivot;
															
 
																+				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
															
 
																-				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
															
 
																-						&sub11[(z+1)+z*ld], 1,
															
 
																-						&sub11[z+(z+1)*ld], ld,
															
 
																-						&sub11[(z+1) + (z+1)*ld],ld);
															
 
																+				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
															
 
																+						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
															
 
																+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
															
 
																+						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
															
 
																 			}
															
@@ -534,7 +544,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 
																 				unsigned rowpiv = ipiv[row+first] - first;
															
 
																 				if (rowpiv != row)
															
 
																 				{
															
 
																-					CUBLAS_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
															
 
																+					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
															
 
																 				}
															
 
																 			}
															
--- a/examples/lu/zlu.c
+++ b/examples/lu/zlu.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_double.h"
															
 
																+#include "xlu.c"
															
--- a/examples/lu/zlu_implicit.c
+++ b/examples/lu/zlu_implicit.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_double.h"
															
 
																+#include "xlu_implicit.c"
															
--- a/examples/lu/zlu_implicit_pivot.c
+++ b/examples/lu/zlu_implicit_pivot.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_double.h"
															
 
																+#include "xlu_implicit_pivot.c"
															
--- a/examples/lu/zlu_kernels.c
+++ b/examples/lu/zlu_kernels.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_double.h"
															
 
																+#include "xlu_kernels.c"
															
--- a/examples/lu/zlu_pivot.c
+++ b/examples/lu/zlu_pivot.c
@@ -0,0 +1,19 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include "complex_double.h"
															
 
																+#include "xlu_pivot.c"
															
--- a/gcc-plugin/examples/Makefile.am
+++ b/gcc-plugin/examples/Makefile.am
@@ -14,12 +14,30 @@
 
																 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																 noinst_PROGRAMS =				\
															
 
																-  matrix-mult
															
 
																+  matrix-mult stencil5
															
 
																+
															
 
																+if !NO_BLAS_LIB
															
 
																+noinst_PROGRAMS +=				\
															
 
																+  cholesky/cholesky
															
 
																+endif
															
 
																 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
															
 
																 AM_CPPFLAGS =						\
															
 
																   -I$(top_srcdir)/include				\
															
 
																+  -I$(top_srcdir)/examples				\
															
 
																   $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
															
 
																 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
															
 
																+
															
 
																+if !NO_BLAS_LIB
															
 
																+# Source files of the cholesky/cholesky example program; these
															
 
																+# definitions only take effect when the NO_BLAS_LIB conditional is false.
															
 
																+cholesky_cholesky_SOURCES	=		\
															
 
																+	cholesky/cholesky.c		\
															
 
																+	cholesky/cholesky_models.c	\
															
 
																+	cholesky/cholesky_kernels.c	\
															
 
																+	$(top_srcdir)/examples/common/blas.c
															
 
																+
															
 
																+# Extra link arguments for cholesky/cholesky, taken from STARPU_BLAS_LDFLAGS.
															
 
																+cholesky_cholesky_LDADD	=	\
															
 
																+	$(STARPU_BLAS_LDFLAGS)
															
 
																+endif
															
 
																+
															
--- a/gcc-plugin/examples/stencil5.c
+++ b/gcc-plugin/examples/stencil5.c
@@ -0,0 +1,127 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <limits.h>
															
 
																+#include <math.h>
															
 
																+#include <stdio.h>
															
 
																+#include <stdlib.h>
															
 
																+#include <string.h>
															
 
																+#include <time.h>
															
 
																+
															
 
																+#ifndef STARPU_GCC_PLUGIN
															
 
																+# error must be compiled with the StarPU GCC plug-in
															
 
																+#endif
															
 
																+
															
 
																+/* Definition of the StarPU task and its CPU implementation.  */
															
 
																+static void stencil5(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
															
 
																+	__attribute__ ((task));
															
 
																+
															
 
																+static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
															
 
																+	__attribute__ ((task_implementation ("cpu", stencil5)));
															
 
																+
															
 
																+/* CPU implementation of the stencil5 task: overwrites *xy with the
															
 
																+ * arithmetic mean of *xy and the four other values passed by pointer.
															
 
																+ * Only *xy is written; the other four arguments are read-only.  */
															
 
																+static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
															
 
																+{
															
 
																+	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
															
 
																+}
															
 
																+
															
 
																+#define NITER_DEF 20000
															
 
																+#define X         10
															
 
																+#define Y         10
															
 
																+
															
 
																+int display = 0;
															
 
																+int niter = NITER_DEF;
															
 
																+
															
 
																+/* Parse the command line.  "-iter <n>" stores <n> in the global niter and
															
 
																+ * "-display" sets the global display flag; other arguments are ignored.
															
 
																+ * A missing or invalid "-iter" value is reported on stderr and terminates
															
 
																+ * the program with EXIT_FAILURE.  */
															
 
																+static void parse_args(int argc, char **argv)
															
 
																+{
															
 
																+	int i;
															
 
																+	for (i = 1; i < argc; i++) {
															
 
																+		if (strcmp(argv[i], "-iter") == 0) {
															
 
																+			char *argptr;
															
 
																+			long value;
															
 
																+
															
 
																+			/* "-iter" must be followed by a value: otherwise argv[++i]
															
 
																+			 * is the terminating NULL entry of argv and must not be
															
 
																+			 * handed to strtol() or strcmp().  */
															
 
																+			if (i + 1 >= argc) {
															
 
																+				fprintf(stderr, "-iter requires a value\n");
															
 
																+				exit(EXIT_FAILURE);
															
 
																+			}
															
 
																+			value = strtol(argv[++i], &argptr, 10);
															
 
																+			/* Reject input without digits, negative counts (the main
															
 
																+			 * loop counts niter down to zero) and values that do not
															
 
																+			 * fit in the int-typed niter.  */
															
 
																+			if (argptr == argv[i] || value < 0 || value > INT_MAX) {
															
 
																+				fprintf(stderr, "invalid iteration count: %s\n", argv[i]);
															
 
																+				exit(EXIT_FAILURE);
															
 
																+			}
															
 
																+			niter = (int) value;
															
 
																+		}
															
 
																+		else if (strcmp(argv[i], "-display") == 0) {
															
 
																+			display = 1;
															
 
																+		}
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+/* Return the next rand() value divided by RAND_MAX, both converted to
															
 
																+ * float, i.e. a pseudo-random number between 0 and 1.  */
															
 
																+static float my_rand (void)
															
 
																+{
															
 
																+	return (float) rand () / (float) RAND_MAX;
															
 
																+}
															
 
																+
															
 
																+/* Fill an X-by-Y matrix with random values, apply the stencil5 task to every
															
 
																+ * interior cell niter times, and optionally print the matrix before and
															
 
																+ * after.  Always returns EXIT_SUCCESS.  */
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+        int x, y;
															
 
																+        float mean=0;
															
 
																+        float matrix[X][Y];
															
 
																+
															
 
																+        parse_args(argc, argv);
															
 
																+
															
 
																+	/* Seed from the current time, then fill the matrix with values in
															
 
																+	 * the range 0..100 while accumulating their sum in mean.  */
															
 
																+	srand (time (NULL));
															
 
																+        for(x = 0; x < X; x++) {
															
 
																+                for (y = 0; y < Y; y++) {
															
 
																+                        matrix[x][y] = my_rand () * 100;
															
 
																+                        mean += matrix[x][y];
															
 
																+                }
															
 
																+        }
															
 
																+        /* Both loops ran to completion, so x == X and y == Y here and
															
 
																+         * x*y is the number of cells.  */
															
 
																+        mean /= (x*y);
															
 
																+
															
 
																+        if (display) {
															
 
																+                fprintf(stdout, "mean=%f\n", mean);
															
 
																+                for(x = 0; x < X; x++) {
															
 
																+                        for (y = 0; y < Y; y++) {
															
 
																+                                fprintf(stdout, "%3f ", matrix[x][y]);
															
 
																+                        }
															
 
																+                        fprintf(stdout, "\n");
															
 
																+                }
															
 
																+        }
															
 
																+
															
 
																+#pragma starpu initialize
															
 
																+
															
 
																+        /* Register every cell individually (the trailing 1 is presumably
															
 
																+         * the element count -- confirm against the plug-in documentation).  */
															
 
																+        for(x = 0; x < X; x++) {
															
 
																+		for (y = 0; y < Y; y++) {
															
 
																+#pragma starpu register &matrix[x][y] 1
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	/* One stencil5 call per interior cell and per iteration; border
															
 
																+	 * cells are only ever passed as neighbours.  The loop ends when
															
 
																+	 * niter reaches exactly 0, so it relies on niter being >= 0.  */
															
 
																+	while(niter--) {
															
 
																+                for (x = 1; x < X-1; x++) {
															
 
																+                        for (y = 1; y < Y-1; y++) {
															
 
																+                                stencil5(&matrix[x][y], &matrix[x-1][y], &matrix[x+1][y],
															
 
																+					 &matrix[x][y-1], &matrix[x][y+1]);
															
 
																+                        }
															
 
																+                }
															
 
																+        }
															
 
																+
															
 
																+#pragma starpu wait
															
 
																+
															
 
																+        for(x = 0; x < X; x++) {
															
 
																+                for (y = 0; y < Y; y++) {
															
 
																+#pragma starpu unregister &matrix[x][y]
															
 
																+                }
															
 
																+        }
															
 
																+
															
 
																+#pragma starpu shutdown
															
 
																+
															
 
																+        /* mean is not recomputed after the iterations: the value printed
															
 
																+         * here is still the mean of the initial matrix.  */
															
 
																+        if (display) {
															
 
																+                fprintf(stdout, "mean=%f\n", mean);
															
 
																+                for(x = 0; x < X; x++) {
															
 
																+                        for (y = 0; y < Y; y++) {
															
 
																+                                fprintf(stdout, "%3f ", matrix[x][y]);
															
 
																+                        }
															
 
																+                        fprintf(stdout, "\n");
															
 
																+                }
															
 
																+        }
															
 
																+
															
 
																+	return EXIT_SUCCESS;
															
 
																+}
															
--- a/gcc-plugin/src/starpu.c
+++ b/gcc-plugin/src/starpu.c
@@ -690,6 +690,12 @@ handle_task_attribute (tree *node, tree name, tree args,
 
																 	      "%<task%> attribute only applies to functions");
															
 
																   else
															
 
																     {
															
 
																+      if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (fn))))
															
 
																+	/* Raise an error but keep going to avoid spitting out too many
															
 
																+	   errors at the user's face.  */
															
 
																+	error_at (DECL_SOURCE_LOCATION (fn),
															
 
																+		  "task return type must be %<void%>");
															
 
																+
															
 
																       /* This is a function declaration for something local to this
															
 
																 	 translation unit, so add the `task' attribute to FN.  */
															
 
																       *no_add_attrs = false;
															
--- a/gcc-plugin/tests/run-test.in
+++ b/gcc-plugin/tests/run-test.in
@@ -261,10 +261,15 @@ unsatisfied directives."
 
																 (define (executable-file source)
															
 
																   "Return the name of the executable file corresponding to SOURCE."
															
 
																-  (let ((dot (string-rindex source #\.)))
															
 
																-    (if dot
															
 
																-        (substring source 0 dot)
															
 
																-        (string-append source ".exe"))))
															
 
																+  ;; EXE is SOURCE without its last extension, or SOURCE with ".exe"
															
 
																+  ;; appended when SOURCE contains no dot at all.
															
 
																+  (let* ((dot (string-rindex source #\.))
															
 
																+         (exe (if dot
															
 
																+                  (substring source 0 dot)
															
 
																+                  (string-append source ".exe")))
															
 
																+         )
															
 
																+  ;; When EXE starts with %srcdir, replace that prefix with %builddir;
															
 
																+  ;; otherwise return EXE unchanged.
															
 
																+  (if (string-prefix? %srcdir exe)
															
 
																+      (string-append %builddir (substring exe (string-length %srcdir)))
															
 
																+      exe
															
 
																+      )))
															
 
																 (define (compile/match file cc cflags ldflags)
															
 
																   "Read directives from FILE, and compiler/link/run it.  Make sure directives
															
--- a/gcc-plugin/tests/task-errors.c
+++ b/gcc-plugin/tests/task-errors.c
@@ -49,6 +49,9 @@ static void my_task_wrong_target_arg (int foo, char *bar) /* (error "string cons
 
																 static void my_task_with_a_body (int foo, char *bar)
															
 
																   __attribute__ ((task, unused));
															
 
																+extern int my_task_not_void (int foo) /* (error "return type") */
															
 
																+  __attribute__ ((task));
															
 
																+
															
 
																 static void
															
 
																 my_task_cpu (int foo, float *bar)
															
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -78,15 +78,13 @@ struct starpu_sched_policy_s {
 
																 	/* Insert a task into the scheduler. */
															
 
																         int (*push_task)(struct starpu_task *, unsigned);
															
 
																-	/* Notify the scheduler that a task was pushed on the worker. This
															
 
																-	 * method is called when a task that was explicitely assigned to a
															
 
																-	 * worker is scheduled. This method therefore permits to keep the state
															
 
																-	 * of of the scheduler coherent even when StarPU bypasses the
															
 
																-	 * scheduling strategy. */
															
 
																+	/* Notify the scheduler that a task was directly pushed to the worker
															
 
																+	 * without going through the scheduler. This method is called when a
															
 
																+	 * task is explicitly assigned to a worker. This method therefore
															
 
																+	 * makes it possible to keep the timing state of the scheduler coherent even
															
 
																+	 * when StarPU bypasses the scheduling strategy. */
															
 
																 	void (*push_task_notify)(struct starpu_task *, int workerid, unsigned);
															
 
																-	/* Insert a priority task into the scheduler. */
															
 
																-        int (*push_prio_task)(struct starpu_task *, unsigned);
															
 
																 	/* Get a task from the scheduler. The mutex associated to the worker is
															
 
																 	 * already taken when this method is called. */
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -429,12 +429,12 @@ static void benchmark_all_gpu_devices(void)
 
																 #endif
															
 
																 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																-	ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+	ncpus = config->topology.ncpus;
															
 
																 	/* TODO: measure bandwidth between GPU-GPU */
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-        cudaGetDeviceCount(&ncuda);
															
 
																+	ncuda = _starpu_get_cuda_device_count();
															
 
																 	for (i = 0; i < ncuda; i++)
															
 
																 	{
															
 
																 		fprintf(stderr," CUDA %d...", i);
															
@@ -481,7 +481,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 
																 	char hostname[32];
															
 
																 	char *forced_hostname = getenv("STARPU_HOSTNAME");
															
 
																 	if (forced_hostname && forced_hostname[0])
															
 
																-		snprintf(hostname, sizeof(hostname), forced_hostname);
															
 
																+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
															
 
																 	else
															
 
																 		gethostname(hostname, sizeof(hostname));
															
 
																 	strncat(path, ".", maxlen);
															
@@ -509,11 +509,11 @@ static void load_bus_affinity_file_content(void)
 
																 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
															
 
																 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																-	ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+	ncpus = config->topology.ncpus;
															
 
																         int gpu;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-        cudaGetDeviceCount(&ncuda);
															
 
																+	ncuda = _starpu_get_cuda_device_count();
															
 
																 	for (gpu = 0; gpu < ncuda; gpu++)
															
 
																 	{
															
 
																 		int ret;
															
@@ -1020,9 +1020,9 @@ static void check_bus_config_file()
 
																                 fclose(f);
															
 
																                 // Loading current configuration
															
 
																-                ncpus = _starpu_topology_get_nhwcpu(config);
															
 
																+                ncpus = config->topology.ncpus;
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																-                cudaGetDeviceCount(&ncuda);
															
 
																+		ncuda = _starpu_get_cuda_device_count();
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																                 nopencl = _starpu_opencl_get_device_count();
															
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -204,17 +204,117 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_
 
																 	}
															
 
																 }
															
 
																-static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
															
 
																+static void parse_arch(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history, unsigned archmin, unsigned archmax, int skiparch)
															
 
																 {
															
 
																-	unsigned arch;
															
 
																-	unsigned nimpl;
															
 
																-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++) {
															
 
																-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
															
 
																-			parse_per_arch_model_file(f, &model->per_arch[arch][nimpl], scan_history);
															
 
																+	unsigned arch, impl;
															
 
																+	struct starpu_per_arch_perfmodel_t dummy;
															
 
																+	int nimpls, implmax, skipimpl;
															
 
																+	unsigned ret;
															
 
																+	
															
 
																+
															
 
																+	for (arch = archmin; arch < archmax; arch++) {
															
 
																+		_starpu_drop_comments(f);
															
 
																+		ret = fscanf(f, "%d\n", &nimpls);
															
 
																+		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
															
 
																+		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
															
 
																+		for (impl = 0; impl < implmax; impl++) {
															
 
																+			parse_per_arch_model_file(f, &model->per_arch[arch][impl], scan_history);
															
 
																+		}
															
 
																+		if (skipimpl > 0) {
															
 
																+			for (impl = 0; impl < skipimpl; impl++) {
															
 
																+				parse_per_arch_model_file(f, &dummy, 0);
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	if (skiparch > 0) {
															
 
																+		_starpu_drop_comments(f);
															
 
																+		ret = fscanf(f, "%d\n", &nimpls);
															
 
																+		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
															
 
																+		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
															
 
																+		for (arch = 0; arch < skiparch; arch ++) {
															
 
																+			for (impl = 0; impl < implmax; impl++) {
															
 
																+				parse_per_arch_model_file(f, &dummy, 0);
															
 
																+			}
															
 
																+			if (skipimpl > 0) {
															
 
																+				for (impl = 0; impl < skipimpl; impl++) {
															
 
																+					parse_per_arch_model_file(f, &dummy, 0);
															
 
																+				}
															
 
																+			}
															
 
																 		}
															
 
																 	}
															
 
																 }
															
 
																+static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
															
 
																+{
															
 
																+	unsigned ret;
															
 
																+	unsigned archmin = 0;
															
 
																+	unsigned max_gordondevs = 1; /* XXX : we need a STARPU_MAXGORDONDEVS cst */
															
 
																+	unsigned narchs;
															
 
																+	int nimpls;
															
 
																+
															
 
																+	/* We could probably write a clean loop here, but the code would not
															
 
																+	 * really be easier to read. */
															
 
																+
															
 
																+	/* Parsing CPUs */
															
 
																+	_starpu_drop_comments(f);
															
 
																+	ret = fscanf(f, "%u\n", &narchs);
															
 
																+	STARPU_ASSERT(ret == 1);
															
 
																+
															
 
																+	_STARPU_DEBUG("Parsing %u CPUs\n", narchs);
															
 
																+	if (narchs > 0)
															
 
																+	{
															
 
																+		parse_arch(f, model, scan_history,
															
 
																+				archmin,
															
 
																+				STARPU_MIN(narchs, STARPU_MAXCPUS),
															
 
																+				narchs - STARPU_MAXCPUS);
															
 
																+	}
															
 
																+
															
 
																+	/* Parsing CUDA devs */
															
 
																+	_starpu_drop_comments(f);
															
 
																+	ret = fscanf(f, "%u\n", &narchs);
															
 
																+	STARPU_ASSERT(ret == 1);
															
 
																+	_STARPU_DEBUG("Parsing %u CUDA devices\n", narchs);
															
 
																+	if (narchs > 0)
															
 
																+	{
															
 
																+		archmin += STARPU_MAXCPUS;
															
 
																+		parse_arch(f, model, scan_history,
															
 
																+				archmin,
															
 
																+				archmin + STARPU_MIN(narchs, STARPU_MAXCUDADEVS),
															
 
																+				narchs - STARPU_MAXCUDADEVS);
															
 
																+	}
															
 
																+
															
 
																+	/* Parsing OpenCL devs */
															
 
																+	_starpu_drop_comments(f);
															
 
																+	ret = fscanf(f, "%u\n", &narchs);
															
 
																+	STARPU_ASSERT(ret == 1);
															
 
																+
															
 
																+	_STARPU_DEBUG("Parsing %u OpenCL devices\n", narchs);
															
 
																+	if (narchs > 0)
															
 
																+	{
															
 
																+		archmin += STARPU_MAXCUDADEVS;
															
 
																+		parse_arch(f, model, scan_history,
															
 
																+				archmin,
															
 
																+				archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
															
 
																+				narchs - STARPU_MAXOPENCLDEVS);
															
 
																+	}
															
 
																+
															
 
																+	/* Parsing Gordon implementations */
															
 
																+	_starpu_drop_comments(f);
															
 
																+	ret = fscanf(f, "%u\n", &narchs);
															
 
																+	STARPU_ASSERT(ret == 1);
															
 
																+
															
 
																+	_STARPU_DEBUG("Parsing %u Gordon devices\n", narchs);
															
 
																+	if (narchs > 0)
															
 
																+	{
															
 
																+		archmin += STARPU_MAXOPENCLDEVS;
															
 
																+		parse_arch(f, model, scan_history,
															
 
																+				archmin,
															
 
																+				archmin + max_gordondevs,
															
 
																+				narchs - max_gordondevs);
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																 static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
															
 
																 {
															
@@ -235,7 +335,12 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 
																 		}
															
 
																 	}
															
 
																+	if (nentries == 0)
															
 
																+		return;
															
 
																 	/* header */
															
 
																+	char archname[32];
															
 
																+	starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
															
 
																+	fprintf(f, "# Model for %s\n", archname);
															
 
																 	fprintf(f, "# number of entries\n%u\n", nentries);
															
 
																 	dump_reg_model(f, model, arch, nimpl);
															
@@ -250,23 +355,123 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 
																 			ptr = ptr->next;
															
 
																 		}
															
 
																 	}
															
 
																+
															
 
																+	fprintf(f, "\n##################\n");
															
 
																 }
															
 
																-static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
															
 
																+static unsigned get_n_entries(struct starpu_perfmodel_t *model, unsigned arch, unsigned impl)
															
 
																 {
															
 
																-	fprintf(f, "#################\n");
															
 
																+	struct starpu_per_arch_perfmodel_t *per_arch_model;
															
 
																+	per_arch_model = &model->per_arch[arch][impl];
															
 
																+	/* count the number of elements in the lists */
															
 
																+	struct starpu_history_list_t *ptr = NULL;
															
 
																+	unsigned nentries = 0;
															
 
																+
															
 
																+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
															
 
																+	{
															
 
																+		/* Dump the list of all entries in the history */
															
 
																+		ptr = per_arch_model->list;
															
 
																+		while(ptr) {
															
 
																+			nentries++;
															
 
																+			ptr = ptr->next;
															
 
																+		}
															
 
																+	}
															
 
																+	return nentries;
															
 
																+}
															
 
																+static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
															
 
																+{
															
 
																+	unsigned number_of_archs[4] = { 0, 0, 0, 0};
															
 
																 	unsigned arch;
															
 
																 	unsigned nimpl;
															
 
																+	unsigned idx = 0;
															
 
																+
															
 
																+	/* Finding the number of archs to write for each kind of device */
															
 
																 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
															
 
																 	{
															
 
																+		switch (arch)
															
 
																+		{
															
 
																+			case STARPU_CUDA_DEFAULT:
															
 
																+			case STARPU_OPENCL_DEFAULT:
															
 
																+			case STARPU_GORDON_DEFAULT:
															
 
																+				idx++;
															
 
																+				break;
															
 
																+			default:
															
 
																+				break;
															
 
																+		}
															
 
																+
															
 
																+		unsigned nentries = 0;
															
 
																+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																+		{
															
 
																+			nentries = get_n_entries(model, arch, nimpl) != 0;
															
 
																+			if (nentries > 0)
															
 
																+			{
															
 
																+				number_of_archs[idx]++;
															
 
																+				break;
															
 
																+			}
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	/* Writing stuff */
															
 
																+	char *name;
															
 
																+	unsigned substract_to_arch = 0;
															
 
																+	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
															
 
																+	{
															
 
																+		switch (arch)
															
 
																+		{
															
 
																+			case STARPU_CPU_DEFAULT:
															
 
																+				name = "CPU";
															
 
																+				fprintf(f, "##################\n");
															
 
																+				fprintf(f, "# %ss\n", name);
															
 
																+				fprintf(f, "# number of %s architectures\n", name);
															
 
																+				fprintf(f, "%u\n", number_of_archs[0]);
															
 
																+				break;
															
 
																+			case STARPU_CUDA_DEFAULT:
															
 
																+				name = "CUDA";
															
 
																+				substract_to_arch = STARPU_MAXCPUS;
															
 
																+				fprintf(f, "##################\n");
															
 
																+				fprintf(f, "# %ss\n", name);
															
 
																+				fprintf(f, "# number of %s architectures\n", name);
															
 
																+				fprintf(f, "%u\n", number_of_archs[1]);
															
 
																+				break;
															
 
																+			case STARPU_OPENCL_DEFAULT:
															
 
																+				name = "OPENCL";
															
 
																+				substract_to_arch += STARPU_MAXCUDADEVS;
															
 
																+				fprintf(f, "##################\n");
															
 
																+				fprintf(f, "# %ss\n", name);
															
 
																+				fprintf(f, "# number of %s architectures\n", name);
															
 
																+				fprintf(f, "%u\n", number_of_archs[2]);
															
 
																+				break;
															
 
																+			case STARPU_GORDON_DEFAULT:
															
 
																+				name = "GORDON";
															
 
																+				substract_to_arch += STARPU_MAXOPENCLDEVS;
															
 
																+				fprintf(f, "##################\n");
															
 
																+				fprintf(f, "# %ss\n", name);
															
 
																+				fprintf(f, "# number of %s architectures\n", name);
															
 
																+				fprintf(f, "%u\n", number_of_archs[3]);
															
 
																+				break;
															
 
																+			default:
															
 
																+				break;
															
 
																+		}
															
 
																+
															
 
																 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
															
 
																 		{
															
 
																-			char archname[32];
															
 
																-			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
															
 
																-			fprintf(f, "# Model for %s\n", archname);
															
 
																+			if (get_n_entries(model, arch, nimpl) == 0)
															
 
																+				break;
															
 
																+
															
 
																+		}
															
 
																+		unsigned max_impl = nimpl;
															
 
																+
															
 
																+		if (max_impl == 0)
															
 
																+			continue;
															
 
																+
															
 
																+		fprintf(f, "###########\n");
															
 
																+		fprintf(f, "# %s_%u\n", name, arch - substract_to_arch);
															
 
																+		fprintf(f, "# number of implementations\n");
															
 
																+		fprintf(f, "%u\n", max_impl);
															
 
																+		for (nimpl = 0; nimpl < max_impl; nimpl++)
															
 
																+		{
															
 
																 			dump_per_arch_model_file(f, model, arch, nimpl);
															
 
																-			fprintf(f, "\n##################\n");
															
 
																 		}
															
 
																 	}
															
 
																 }
															
@@ -300,7 +505,7 @@ static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *a
 
																 	char hostname[32];
															
 
																 	char *forced_hostname = getenv("STARPU_HOSTNAME");
															
 
																 	if (forced_hostname && forced_hostname[0])
															
 
																-		snprintf(hostname, sizeof(hostname), forced_hostname);
															
 
																+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
															
 
																 	else
															
 
																 		gethostname(hostname, sizeof(hostname));
															
 
																 	strncat(path, ".", maxlen);
															
@@ -351,7 +556,7 @@ static void get_model_path(struct starpu_perfmodel_t *model, char *path, size_t
 
																 	char hostname[32];
															
 
																 	char *forced_hostname = getenv("STARPU_HOSTNAME");
															
 
																 	if (forced_hostname && forced_hostname[0])
															
 
																-		snprintf(hostname, sizeof(hostname), forced_hostname);
															
 
																+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
															
 
																 	else
															
 
																 		gethostname(hostname, sizeof(hostname));
															
 
																 	strncat(path, ".", maxlen);
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -87,7 +87,7 @@ static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct
 
																 			cl->opencl_funcs[nimpl] == NULL);
															
 
																 	case STARPU_GORDON_WORKER:
															
 
																 		return !(cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS &&
															
 
																-			cl->gordon_funcs[nimpl] == NULL);
															
 
																+			cl->gordon_funcs[nimpl] == 0);
															
 
																 	default:
															
 
																 		return 0;
															
 
																 	}
															
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -357,6 +357,11 @@ void _starpu_opencl_init(void)
 
																                 // Get location of OpenCl kernel source files
															
 
																                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
															
 
																+		if (nb_devices > STARPU_MAXOPENCLDEVS) {
															
 
																+			_STARPU_DISP("# Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
															
 
																+			nb_devices = STARPU_MAXOPENCLDEVS;
															
 
																+		}
															
 
																+
															
 
																                 // initialise internal structures
															
 
																                 for(i=0 ; i<nb_devices ; i++) {
															
 
																                         contexts[i] = NULL;
															
@@ -538,7 +543,6 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
																 	struct timespec codelet_start, codelet_end;
															
 
																-	int workerid = args->workerid;
															
 
																 	STARPU_ASSERT(task);
															
 
																 	struct starpu_codelet_t *cl = task->cl;
															
 
																 	STARPU_ASSERT(cl);
															
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -478,6 +478,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
																 					) {
															
 
																 				ntasks_best_end = ntasks_end;
															
 
																 				ntasks_best = worker;
															
 
																+
															
 
																 			}
															
 
																 			if (local_task_length[worker_in_ctx] == -1.0)
															
@@ -492,7 +493,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
																 				unknown = 1;
															
 
																 			if (unknown)
															
 
																-				continue;
															
 
																+					continue;
															
 
																 			exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
															
@@ -501,6 +502,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
																 				/* a better solution was found */
															
 
																 				best_exp_end = exp_end[worker_in_ctx];
															
 
																 				best_impl = nimpl;
															
 
																+
															
 
																 			}
															
 
																 			local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
															
--- a/src/sched_policies/detect_combined_workers.c
+++ b/src/sched_policies/detect_combined_workers.c
@@ -21,150 +21,454 @@
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																 #include <hwloc.h>
															
 
																-#endif
															
 
																-#ifdef STARPU_HAVE_HWLOC
															
 
																-/* This function returns 1 the subtree induced by obj only contains CPU
															
 
																- * workers, otherwise 0 is returned. This function registers all valid worker
															
 
																- * combination below obj. The id of the CPU workers are put in the worker_array
															
 
																- * and their count is put in the worker_cnt pointer. */
															
 
																-static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array, int *worker_cnt)
															
 
																+/* tree_t
															
 
																+ * ======
															
 
																+ * Purpose
															
 
																+ * =======
															
 
																+ * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
															
 
																+ * object and storing every worker found by recursion in all of its sub-trees.
															
 
																+ *
															
 
																+ * Fields
															
 
																+ * ======
															
 
																+ * obj			A hwloc object which can be a root or a leaf, it may be a numa node, a cache memory or a CPU, etc...
															
 
																+ *
															
 
																+ * nb_workers		Number of CPU workers which can be found by recursion in all the sub-trees beneath this one
															
 
																+ *			or in this very object.
															
 
																+ *
															
 
																+ * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
															
 
																+ */
															
 
																+
															
 
																+typedef struct tree_s{
															
 
																+    hwloc_obj_t obj;
															
 
																+    unsigned nb_workers;
															
 
																+    int *workers;
															
 
																+} tree_t;
															
 
																+
															
 
																+
															
 
																+/* gather_trees
															
 
																+ * ============
															
 
																+ * Purpose
															
 
																+ * =======
															
 
																+ * Gather all the workers of every source tree in one target tree.
															
 
																+ * We assume the target array of workers is big enough to contain all the workers.
															
 
																+ *
															
 
																+ * Arguments
															
 
																+ * =========
															
 
																+ * target_tree		(input, output)
															
 
																+ *			Pointer to the tree which will contain all the workers of every source.
															
 
																+ *
															
 
																+ * source_trees		(input)
															
 
																+ *			Array of trees we want to combine in a unique tree.
															
 
																+ *
															
 
																+ * nb_source_trees	(input)
															
 
																+ *			Number of trees we want to combine (size of the array).
															
 
																+ */
															
 
																+
															
 
																+static void gather_trees(tree_t *target_tree, tree_t *source_trees, unsigned nb_source_trees)
															
 
																+{
															
 
																+    unsigned tree_id, worker_id, index = 0;
															
 
																+    for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
															
 
																+	for(worker_id = 0; worker_id < source_trees[tree_id].nb_workers; ++worker_id)
															
 
																+	    target_tree->workers[index++] = source_trees[tree_id].workers[worker_id];
															
 
																+}
															
 
																+
															
 
																+/* assign_multiple_trees
															
 
																+ * ========================
															
 
																+ * Purpose
															
 
																+ * =======
															
 
																+ * Assign every tree which is large enough (at least min_size workers) and merge small ones.
															
 
																+ * If there is no tree large enough to be assigned any more, we return.
															
 
																+ *
															
 
																+ * Return value
															
 
																+ * ============
															
 
																+ * The number of workers assigned during the function.
															
 
																+ *
															
 
																+ * Arguments
															
 
																+ * =========
															
 
																+ * trees		(input, output)
															
 
																+ *			On entry, array of trees to assign. On return, at most one tree still contains workers.
															
 
																+ *
															
 
																+ * nb_trees		(input)
															
 
																+ *			The number of trees (size of the array).
															
 
																+ *
															
 
																+ * min_size		(input)
															
 
																+ *			Minimum size of a combined worker.
															
 
																+ *
															
 
																+ * max_size		(input)
															
 
																+ *			Maximum size of a combined worker.
															
 
																+ */
															
 
																+
															
 
																+static unsigned assign_multiple_trees(tree_t *trees, unsigned nb_trees, int min_size, int max_size)
															
 
																 {
															
 
																-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+    unsigned short complete = 0;
															
 
																+    unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
															
 
																+
															
 
																+    for(tree_id = 0; tree_id < nb_trees; ++tree_id)
															
 
																+	nb_workers_total += trees[tree_id].nb_workers;;
															
 
																-	/* Is this a leaf ? (eg. a PU for hwloc) */
															
 
																-	int is_leaf = !hwloc_compare_types(config->cpu_depth, obj->depth);
															
 
																+    while(!complete)
															
 
																+    {
															
 
																+	complete = 1;
															
 
																-	if (is_leaf)
															
 
																+	/* First we manage to assign every subtree large enough to be assigned alone */
															
 
																+	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
															
 
																 	{
															
 
																-		struct starpu_worker_s *worker = obj->userdata;
															
 
																+	    if(trees[tree_id].nb_workers== 0) // An already assigned subtree
															
 
																+		continue;
															
 
																-		/* If this is a CPU worker, append its id at the end of the
															
 
																-		 * list */
															
 
																-		if (worker && worker->arch == STARPU_CPU_WORKER)
															
 
																+	    nb_workers_tree = trees[tree_id].nb_workers;
															
 
																+
															
 
																+	    /* We shouldn't assign a small tree alone if all the remaining trees could be assigned together instead */
															
 
																+	    if(nb_workers_tree >= min_size && nb_workers_total > max_size)
															
 
																+	    {
															
 
																+		int ret = starpu_combined_worker_assign_workerid(nb_workers_tree, trees[tree_id].workers);
															
 
																+		STARPU_ASSERT(ret >= 0);
															
 
																+		nb_workers_assigned += nb_workers_tree;
															
 
																+		nb_workers_total -= nb_workers_tree;
															
 
																+		trees[tree_id].nb_workers = 0;
															
 
																+	    }
															
 
																+	}
															
 
																+
															
 
																+	/* Then we merge too small subtrees into not too large ones
															
 
																+	 * if we manage to merge some subtrees we turn the flag
															
 
																+	 * complete to 0 thus we know we have to start again to assign
															
 
																+	 * just merged subtrees */
															
 
																+	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
															
 
																+	{
															
 
																+	    if(trees[tree_id].nb_workers == 0) // An already assigned subtree
															
 
																+		continue;
															
 
																+
															
 
																+	    nb_workers_tree = trees[tree_id].nb_workers;
															
 
																+
															
 
																+	    /* We go through the array to find another subtree we can merge with this one */
															
 
																+	    for(tree_id2 = 0; tree_id2 < nb_trees; ++tree_id2)
															
 
																+	    {
															
 
																+		if(trees[tree_id2].nb_workers == 0 || tree_id == tree_id2) // An already assigned subtree or the same
															
 
																+		    continue;
															
 
																+
															
 
																+		nb_workers_tree2 = trees[tree_id2].nb_workers;
															
 
																+
															
 
																+		/*  We can merge the two subtrees, let's do it */
															
 
																+		if(nb_workers_tree + nb_workers_tree2 <= max_size)
															
 
																 		{
															
 
																-			worker_array[*worker_cnt] = worker->workerid;
															
 
																-			*worker_cnt = *worker_cnt + 1;
															
 
																+		    for(worker_id = 0; worker_id < nb_workers_tree2; ++worker_id)
															
 
																+			trees[tree_id].workers[nb_workers_tree + worker_id] = trees[tree_id2].workers[worker_id];
															
 
																+
															
 
																+		    trees[tree_id].nb_workers += nb_workers_tree2;
															
 
																+		    trees[tree_id2].nb_workers = 0;
															
 
																+
															
 
																+		    /* We just merged two subtrees, we need to restart again and try to assign it */
															
 
																+		    complete = 0;
															
 
																+		    break;
															
 
																 		}
															
 
																+	    }
															
 
																-		/* We cannot create a combined worker only if there is a CPU
															
 
																-		 * worker. */
															
 
																-		return (!worker || worker->arch == STARPU_CPU_WORKER);
															
 
																+	    if(!complete)
															
 
																+		break;
															
 
																 	}
															
 
																+    }
															
 
																-	/* If there is only one child, we go to the next level directly */
															
 
																-	if (obj->arity == 1)
															
 
																-		return find_combinations_with_hwloc_rec(obj->children[0], worker_array, worker_cnt);
															
 
																+    return nb_workers_assigned;
															
 
																+}
															
 
																-	/* We recursively go from the root to the leaves of the tree to find
															
 
																-	 * subtrees that only have CPUs as leaves. */
															
 
																-	unsigned cpu_children_cnt = 0;
															
 
																+/* find_and_assign_combinations_with_hwloc_recursive
															
 
																+ * =================================================
															
 
																+ * Purpose
															
 
																+ * =======
															
 
																+ * Go through the tree given as parameter and try to assign its workers. Workers it could not
															
 
																+ * assign are given back to the calling function to be assigned together with workers from other subtrees.
															
 
																+ *
															
 
																+ * Return value
															
 
																+ * ============
															
 
																+ * The number of workers left to be assigned.
															
 
																+ *
															
 
																+ * Arguments
															
 
																+ * =========
															
 
																+ * tree			(input, output)
															
 
																+ *			Tree structure containing the root to process in entry.
															
 
																+ *			When the function returns it also contains the number of workers left
															
 
																+ *			to be assigned and these very workers in the array previously allocated.
															
 
																+ *
															
 
																+ * min_size		(input)
															
 
																+ *			Minimum size of a combined worker.
															
 
																+ *
															
 
																+ * max_size		(input)
															
 
																+ *			Maximum size of a combined worker.
															
 
																+ */
															
 
																+
															
 
																+static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree, int min_size, int max_size)
															
 
																+{
															
 
																+    unsigned subtree_id, nb_workers = 0;
															
 
																-	int worker_array_rec[STARPU_NMAXWORKERS];
															
 
																-	int worker_cnt_rec = 0;
															
 
																-	memset(worker_array_rec, 0, sizeof(int)*STARPU_NMAXWORKERS);
															
 
																+    hwloc_obj_t obj = tree->obj;
															
 
																+    int *workers = tree->workers;
															
 
																-	unsigned i;
															
 
																-	for (i = 0; i < obj->arity; i++)
															
 
																+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+
															
 
																+    /* Is this a leaf ? (eg. a PU for hwloc) */
															
 
																+    if (!hwloc_compare_types(config->cpu_depth, obj->depth))
															
 
																+    {
															
 
																+	struct starpu_worker_s *worker = obj->userdata;
															
 
																+
															
 
																+	/* If this is a CPU worker add it at the beginning
															
 
																+	 * of the array, write 1 in the field nb_workers and
															
 
																+	 * return the number of CPU workers found : 1 in this case. */
															
 
																+	if (worker && worker->arch == STARPU_CPU_WORKER)
															
 
																 	{
															
 
																-		int valid_subtree = find_combinations_with_hwloc_rec(obj->children[i],
															
 
																-						worker_array_rec, &worker_cnt_rec);
															
 
																-		if (valid_subtree)
															
 
																-			cpu_children_cnt++;
															
 
																+	    workers[0] = worker->workerid;
															
 
																+	    tree->nb_workers = 1;
															
 
																+	    return 1;
															
 
																 	}
															
 
																-	int child;
															
 
																+	tree->nb_workers = 0;
															
 
																+	return 0;
															
 
																+    }
															
 
																+
															
 
																+
															
 
																+    /* If there is only one child, we go to the next level right away */
															
 
																+    if (obj->arity == 1)
															
 
																+    {
															
 
																+	tree_t subtree = *tree;
															
 
																+	subtree.obj = obj->children[0];
															
 
																+	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
															
 
																+	tree->nb_workers = nb_workers;
															
 
																+	return nb_workers;
															
 
																+    }
															
 
																+
															
 
																+    /* We recursively go to the leaves of the tree to find subtrees which have the biggest number of
															
 
																+     * CPU leaves that fits between min and max. */
															
 
																+
															
 
																+    /* We allocate an array of tree structures which will contain the current node's subtrees data */
															
 
																+    tree_t *subtrees = (tree_t *) malloc(obj->arity * sizeof(tree_t));
															
 
																+
															
 
																+    /* We allocate the array containing the workers of each subtree and initialize the fields left */
															
 
																+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
															
 
																+    {
															
 
																+	tree_t *subtree = subtrees + subtree_id;
															
 
																+
															
 
																+	subtree->obj = obj->children[subtree_id];
															
 
																+	subtree->nb_workers = 0;
															
 
																+	subtree->workers = (int *) malloc(config->topology.nhwcpus * sizeof(int));
															
 
																+    }
															
 
																+
															
 
																+    /* We recursively go through every subtree and get all the workers which are not assigned yet */
															
 
																+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
															
 
																+	nb_workers += find_and_assign_combinations_with_hwloc_recursive(subtrees + subtree_id, min_size, max_size);
															
 
																+
															
 
																+    if(nb_workers > max_size)
															
 
																+    {
															
 
																+	/* We withdraw the number of workers just assigned from the total number of workers */
															
 
																+	nb_workers -= assign_multiple_trees(subtrees, obj->arity, min_size, max_size);
															
 
																-	if (cpu_children_cnt == obj->arity)
															
 
																-	for (child = 0; child < worker_cnt_rec; child++)
															
 
																+	/* Some workers are not assigned yet : we gather them in the array
															
 
																+	 * which is returned to the father which will handle them later */
															
 
																+	if(nb_workers)
															
 
																+	    gather_trees(tree, subtrees, obj->arity);
															
 
																+    }
															
 
																+    else if(nb_workers < max_size)
															
 
																+    {
															
 
																+	gather_trees(tree, subtrees, obj->arity);
															
 
																+    }
															
 
																+    else // nb_workers == max_size
															
 
																+    {
															
 
																+	gather_trees(tree, subtrees, obj->arity);
															
 
																+
															
 
																+	int ret = starpu_combined_worker_assign_workerid(nb_workers, workers);
															
 
																+	STARPU_ASSERT(ret >= 0);
															
 
																+	nb_workers = 0;
															
 
																+    }
															
 
																+
															
 
																+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
															
 
																+	free(subtrees[subtree_id].workers);
															
 
																+    free(subtrees);
															
 
																+
															
 
																+    tree->nb_workers = nb_workers;
															
 
																+    return nb_workers;
															
 
																+}
															
 
																+
															
 
																+/* get_min_max_sizes
															
 
																+ * =================================================
															
 
																+ * Purpose
															
 
																+ * =======
															
 
																+ * First, try to get the value from the STARPU_MIN_WORKERSIZE and STARPU_MAX_WORKERSIZE
															
 
																+ * environment variables.
															
 
																+ * If neither of them was set, then we try to get some efficient values following the rule below:
															
 
																+ *
															
 
																+ * 				-->   exact 	-->  MIN_SIZE = S-1 <--> MAX_SIZE = S+1
															
 
																+ * S = square_root(nb_cpus)
															
 
																+ *				-->   decimal 	-->  MIN_SIZE = truncation(S) <--> MAX_SIZE = rounding_up(S)
															
 
																+ *
															
 
																+ * If only one of the two was not set, then we set it to a value relative to the other, for example:
															
 
																+ *
															
 
																+ *		 	MIN_SIZE = MAX_SIZE - 1 or MAX_SIZE = MIN_SIZE + 1
															
 
																+ *
															
 
																+ * Arguments
															
 
																+ * =========
															
 
																+ * min_size		(output)
															
 
																+ *			Pointer to the minimum size of a combined worker, whether set with
															
 
																+ *			value given by the user or processed from the number of cpus.
															
 
																+ *
															
 
																+ * max_size		(output)
															
 
																+ *			Pointer to the maximum size of a combined worker, whether set with
															
 
																+ *			value given by the user or processed from the number of cpus.
															
 
																+ *
															
 
																+ * topology		(input)
															
 
																+ *			Topology of the machine : used to know the number of cpus.
															
 
																+ */
															
 
																+
															
 
																+static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machine_topology_s *topology)
															
 
																+{
															
 
																+    int _min_size, _max_size;
															
 
																+
															
 
																+    _min_size = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
															
 
																+    _max_size = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
															
 
																+
															
 
																+    /* If the user didn't set both the environment variables,
															
 
																+     * we need to find a minimum and a maximum size ourselves */
															
 
																+    if(_min_size <= -1 || _max_size <= -1)
															
 
																+    {
															
 
																+
															
 
																+	int nb_cpus = topology->nhwcpus;
															
 
																+	int sqrt_nb_cpus = sqrt(nb_cpus);
															
 
																+	short exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
															
 
																+
															
 
																+	    if(_min_size == -1)
															
 
																+	    {
															
 
																+		if(_max_size > -1)
															
 
																+		    _min_size = _max_size - 1;
															
 
																+		else
															
 
																+		    _min_size = exact ? sqrt_nb_cpus - 1 : sqrt_nb_cpus;
															
 
																+	    }
															
 
																+
															
 
																+	if(_max_size == -1)
															
 
																 	{
															
 
																-		worker_array[*worker_cnt] = worker_array_rec[child];
															
 
																-		*worker_cnt = *worker_cnt + 1;
															
 
																+	    if(_min_size > -1)
															
 
																+		_max_size = _min_size + 1;
															
 
																+	    else
															
 
																+		_max_size = sqrt_nb_cpus + 1;
															
 
																 	}
															
 
																-	
															
 
																-	/* If there is at least 2 children that are valid, we combined them. */
															
 
																-	int maxsize = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
															
 
																-	int minsize = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
															
 
																+    }
															
 
																-	if (cpu_children_cnt > 1 && worker_cnt_rec > 0 && worker_cnt_rec <= maxsize && worker_cnt_rec >= minsize)
															
 
																-		starpu_combined_worker_assign_workerid(worker_cnt_rec, worker_array_rec);
															
 
																+    *min_size = _min_size;
															
 
																+    *max_size = _max_size;
															
 
																-	return (cpu_children_cnt == obj->arity);
															
 
																+    return;
															
 
																 }
															
 
																-static void find_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
															
 
																+/* find_and_assign_combinations_with_hwloc
															
 
																+ * =======================================
															
 
																+ * Purpose
															
 
																+ * =======
															
 
																+ * Launches find_and_assign_combinations_with_hwloc_recursive function on the root
															
 
																+ * of the hwloc tree to gather and assign combined cpu workers in an efficient manner.
															
 
																+ * When find_and_assign_combinations_with_hwloc_recursive returns, if there are still
															
 
																+ * some workers, we assign them no matter the number for there is no way to respect
															
 
																+ * the wanted sizes anymore.
															
 
																+ *
															
 
																+ * Arguments
															
 
																+ * =========
															
 
																+ * topology		(input)
															
 
																+ *			Topology of the machine : used to know the number of cpus and
															
 
																+ *			to get the hwloc tree.
															
 
																+ */
															
 
																+
															
 
																+static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
															
 
																 {
															
 
																-	/* We don't care about the result */
															
 
																-	int worker_array[STARPU_NMAXWORKERS];
															
 
																-	int worker_cnt = 0;
															
 
																-
															
 
																-	/* We recursively go from the root to the leaves of the tree to find
															
 
																-	 * subtrees that only have CPUs as leaves. */
															
 
																-	hwloc_obj_t root;
															
 
																-	root = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
															
 
																-	find_combinations_with_hwloc_rec(root, worker_array, &worker_cnt);
															
 
																+    unsigned nb_workers;
															
 
																+    int min_size, max_size;
															
 
																+
															
 
																+    get_min_max_sizes(&min_size, &max_size, topology);
															
 
																+
															
 
																+    STARPU_ASSERT(min_size <= max_size);
															
 
																+
															
 
																+    tree_t tree;
															
 
																+
															
 
																+    /* Of course we start from the root */
															
 
																+    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
															
 
																+    tree.nb_workers = 0;
															
 
																+    tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
															
 
																+
															
 
																+    /* We recursively go from the root to the leaves of the tree to find
															
 
																+     * subtrees that only have CPUs as leaves. */
															
 
																+    nb_workers = find_and_assign_combinations_with_hwloc_recursive(&tree, min_size, max_size);
															
 
																+
															
 
																+    /* There are still some workers left, since the only possibility is that
															
 
																+     * the number of workers left is less than the minimum worker size we assign them all */
															
 
																+    if(nb_workers > 0)
															
 
																+    {
															
 
																+	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
															
 
																+	 * while there are enough workers to assign regarding the min_size value */
															
 
																+	STARPU_ASSERT(nb_workers < max_size);
															
 
																+
															
 
																+	int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
															
 
																+	STARPU_ASSERT(ret >= 0);
															
 
																+    }
															
 
																+
															
 
																+    free(tree.workers);
															
 
																 }
															
 
																-#else
															
 
																+#else /* STARPU_HAVE_HWLOC */
															
 
																-static void find_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
															
 
																+static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
															
 
																 {
															
 
																-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																-	/* We put the id of all CPU workers in this array */
															
 
																-	int cpu_workers[STARPU_NMAXWORKERS];
															
 
																-	unsigned ncpus = 0;
															
 
																+    /* We put the id of all CPU workers in this array */
															
 
																+    int cpu_workers[STARPU_NMAXWORKERS];
															
 
																+    unsigned ncpus = 0;
															
 
																-	unsigned i;
															
 
																-	for (i = 0; i < topology->nworkers; i++)
															
 
																-	{
															
 
																-		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
															
 
																-			cpu_workers[ncpus++] = i;
															
 
																-	}
															
 
																-	
															
 
																-	unsigned size;
															
 
																-	for (size = 2; size <= ncpus; size *= 2)
															
 
																+    unsigned i;
															
 
																+    for (i = 0; i < topology->nworkers; i++)
															
 
																+    {
															
 
																+	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
															
 
																+	    cpu_workers[ncpus++] = i;
															
 
																+    }
															
 
																+
															
 
																+    unsigned size;
															
 
																+    for (size = 2; size <= ncpus; size *= 2)
															
 
																+    {
															
 
																+	unsigned first_cpu;
															
 
																+	for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
															
 
																 	{
															
 
																-		unsigned first_cpu;
															
 
																-		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
															
 
																-		{
															
 
																-			if (first_cpu + size <= ncpus)
															
 
																-			{
															
 
																-				int workerids[size];
															
 
																-
															
 
																-				for (i = 0; i < size; i++)
															
 
																-					workerids[i] = cpu_workers[first_cpu + i];
															
 
																-
															
 
																-				/* We register this combination */
															
 
																-				int ret;
															
 
																-				ret = starpu_combined_worker_assign_workerid(size, workerids); 
															
 
																-				STARPU_ASSERT(ret >= 0);
															
 
																-			}
															
 
																-		}
															
 
																+	    if (first_cpu + size <= ncpus)
															
 
																+	    {
															
 
																+		int workerids[size];
															
 
																+
															
 
																+		for (i = 0; i < size; i++)
															
 
																+		    workerids[i] = cpu_workers[first_cpu + i];
															
 
																+
															
 
																+		/* We register this combination */
															
 
																+		int ret;
															
 
																+		ret = starpu_combined_worker_assign_workerid(size, workerids); 
															
 
																+		STARPU_ASSERT(ret >= 0);
															
 
																+	    }
															
 
																 	}
															
 
																+    }
															
 
																 }
															
 
																-#endif
															
 
																+
															
 
																+#endif /* STARPU_HAVE_HWLOC */
															
 
																 static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
															
 
																 {
															
 
																-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
															
 
																-	int cpu_workers[STARPU_NMAXWORKERS];
															
 
																-	unsigned ncpus = 0;
															
 
																+    int cpu_workers[STARPU_NMAXWORKERS];
															
 
																+    unsigned ncpus = 0;
															
 
																-	unsigned i;
															
 
																-	for (i = 0; i < topology->nworkers; i++)
															
 
																-	{
															
 
																-		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
															
 
																-			cpu_workers[ncpus++] = i;
															
 
																-	}
															
 
																+    unsigned i;
															
 
																+    for (i = 0; i < topology->nworkers; i++)
															
 
																+    {
															
 
																+	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
															
 
																+	    cpu_workers[ncpus++] = i;
															
 
																+    }
															
 
																-	if (ncpus > 0)
															
 
																-	{
															
 
																-		int ret;
															
 
																-		ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
															
 
																-		STARPU_ASSERT(ret >= 0);
															
 
																-	}
															
 
																+    if (ncpus > 0)
															
 
																+    {
															
 
																+	int ret;
															
 
																+	ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
															
 
																+	STARPU_ASSERT(ret >= 0);
															
 
																+    }
															
 
																 }
															
 
																 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
															
@@ -175,9 +479,9 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 
																 	combine_all_cpu_workers(topology);
															
 
																     else {
															
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																-	find_combinations_with_hwloc(topology);
															
 
																+	find_and_assign_combinations_with_hwloc(topology);
															
 
																 #else
															
 
																-	find_combinations_without_hwloc(topology);
															
 
																+	find_and_assign_combinations_without_hwloc(topology);
															
 
																 #endif
															
 
																     }
															
 
																 }
															
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -121,6 +121,7 @@ noinst_PROGRAMS =				\
 
																 	datawizard/acquire_cb_insert		\
															
 
																 	datawizard/acquire_release		\
															
 
																 	datawizard/acquire_release2		\
															
 
																+	datawizard/copy				\
															
 
																 	datawizard/data_implicit_deps		\
															
 
																 	datawizard/data_lookup			\
															
 
																 	datawizard/scratch			\
															
--- a/tests/datawizard/copy.c
+++ b/tests/datawizard/copy.c
@@ -0,0 +1,102 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2010-2011  Université de Bordeaux 1
															
 
																+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																+static unsigned nloops = 1000;
															
 
																+
															
 
																+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
															
 
																+{
															
 
																+}
															
 
																+
															
 
																+static starpu_codelet cpu_codelet =
															
 
																+{
															
 
																+        .where = STARPU_CPU,
															
 
																+        .cpu_func = dummy_func,
															
 
																+        .model = NULL,
															
 
																+        .nbuffers = 1
															
 
																+};
															
 
																+
															
 
																+static starpu_codelet gpu_codelet =
															
 
																+{
															
 
																+        .where = STARPU_CUDA|STARPU_OPENCL,
															
 
																+        .cuda_func = dummy_func,
															
 
																+        .opencl_func = dummy_func,
															
 
																+        .model = NULL,
															
 
																+        .nbuffers = 1
															
 
																+};
															
 
																+
															
 
																+
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+        float foo;
															
 
																+	starpu_data_handle float_array_handle;
															
 
																+        int i;
															
 
																+
															
 
																+        starpu_init(NULL);
															
 
																+
															
 
																+	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0)
															
 
																+	{
															
 
																+		FPRINTF(stderr, "This application requires a CUDA or OpenCL Worker\n");
															
 
																+		starpu_shutdown();
															
 
																+		return 77;
															
 
																+	}
															
 
																+
															
 
																+        foo = 0.0f;
															
 
																+	starpu_variable_data_register(&float_array_handle, 0, (uintptr_t)&foo, sizeof(foo));
															
 
																+
															
 
																+        for (i = 0; i < nloops; i++)
															
 
																+        {
															
 
																+		struct starpu_task *task_cpu, *task_gpu;
															
 
																+                int ret;
															
 
																+
															
 
																+		task_cpu = starpu_task_create();
															
 
																+		task_gpu = starpu_task_create();
															
 
																+
															
 
																+		task_cpu->cl = &cpu_codelet;
															
 
																+		task_cpu->callback_func = NULL;
															
 
																+		task_cpu->buffers[0].handle = float_array_handle;
															
 
																+		task_cpu->buffers[0].mode = STARPU_RW;
															
 
																+
															
 
																+		task_gpu->cl = &gpu_codelet;
															
 
																+		task_gpu->callback_func = NULL;
															
 
																+		task_gpu->buffers[0].handle = float_array_handle;
															
 
																+		task_gpu->buffers[0].mode = STARPU_RW;
															
 
																+
															
 
																+		ret = starpu_task_submit(task_cpu);
															
 
																+		if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																+		{
															
 
																+			FPRINTF(stderr, "No worker may execute this task\n");
															
 
																+			exit(0);
															
 
																+		}
															
 
																+
															
 
																+		ret = starpu_task_submit(task_gpu);
															
 
																+		if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																+		{
															
 
																+			FPRINTF(stderr, "No worker may execute this task\n");
															
 
																+			exit(0);
															
 
																+		}
															
 
																+        }
															
 
																+
															
 
																+	starpu_task_wait_for_all();
															
 
																+	starpu_data_unregister(float_array_handle);
															
 
																+        starpu_shutdown();
															
 
																+
															
 
																+        return 0;
															
 
																+}
															
--- a/tests/datawizard/dining_philosophers.c
+++ b/tests/datawizard/dining_philosophers.c
@@ -23,6 +23,8 @@
 
																 starpu_data_handle fork_handles[N];
															
 
																 unsigned forks[N];
															
 
																+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
															
 
																+
															
 
																 static void eat_kernel(void *descr[], void *arg)
															
 
																 {
															
 
																 }
															
@@ -78,6 +80,12 @@ int main(int argc, char **argv)
 
																 	starpu_task_wait_for_all();
															
 
																+	FPRINTF(stderr, "waiting done\n");
															
 
																+	for (f = 0; f < N; f++)
															
 
																+	{
															
 
																+		starpu_data_unregister(fork_handles[f]);
															
 
																+	}
															
 
																+
															
 
																 	starpu_shutdown();
															
 
																 	return 0;