преди 14 години · 482baecdf7
--- a/Makefile.am
+++ b/Makefile.am
@@ -65,13 +65,15 @@ if BUILD_STARPU_TOP
 
				 all-local:
			
 
				 	cd starpu-top ; $(QMAKE) ; $(MAKE)
			
 
				 clean-local:
			
 
				-	cd starpu-top ; $(MAKE) clean
			
 
				+	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
			
 
				 # TODO: resources
			
 
				 install-exec-local:
			
 
				 	$(MKDIR_P) $(DESTDIR)$(bindir)
			
 
				 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
			
 
				 uninstall-local:
			
 
				 	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
			
 
				+	$(RM) starpu-top/StarPU-Top
			
 
				+	$(RM) starpu-top/Makefile
			
 
				 endif
			
 
				 
			
 
				 if STARPU_HAVE_WINDOWS
			
--- a/configure.ac
+++ b/configure.ac
@@ -594,9 +594,11 @@ if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
 
				     	STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
			
 
				         if test "$have_valid_opencl" = "no" ; then
			
 
				             for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH" ; do
			
 
				-    	        STARPU_CHECK_OPENCL($f, "no", "no")
			
 
				-                if test "$have_valid_opencl" = "yes" ; then
			
 
				-                    break
			
 
				+                if test -n $f ; then
			
 
				+    	            STARPU_CHECK_OPENCL($f, "no", "no")
			
 
				+                    if test "$have_valid_opencl" = "yes" ; then
			
 
				+                        break
			
 
				+                    fi
			
 
				                 fi
			
 
				             done
			
 
				         fi
			
@@ -1389,24 +1391,11 @@ AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 
				 # File configuration
			
 
				 AC_CONFIG_COMMANDS([executable-scripts], [
			
 
				   chmod +x tests/regression/regression.sh
			
 
				+  chmod +x gcc-plugin/tests/run-test
			
 
				 ])
			
 
				 
			
 
				 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
			
 
				-AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
			
 
				-
			
 
				-if test $build_gcc_plugin == "yes" ; then
			
 
				-    AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
			
 
				-    AC_OUTPUT([
			
 
				-	    gcc-plugin/Makefile
			
 
				-	    gcc-plugin/src/Makefile
			
 
				-	    gcc-plugin/tests/Makefile
			
 
				-	    gcc-plugin/tests/run-test
			
 
				-	    gcc-plugin/examples/Makefile
			
 
				-            ])
			
 
				-    AC_CONFIG_COMMANDS([executable-plugin-scripts], [
			
 
				-            chmod +x gcc-plugin/tests/run-test
			
 
				-            ])
			
 
				-fi 
			
 
				+AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h gcc-plugin/src/starpu-gcc-config.h)
			
 
				 
			
 
				 AC_OUTPUT([
			
 
				 	Makefile
			
@@ -1425,6 +1414,11 @@ AC_OUTPUT([
 
				 	doc/Makefile
			
 
				 	mpi/Makefile
			
 
				 	starpu-top/StarPU-Top.pro
			
 
				+        gcc-plugin/Makefile
			
 
				+	gcc-plugin/src/Makefile
			
 
				+	gcc-plugin/tests/Makefile
			
 
				+	gcc-plugin/tests/run-test
			
 
				+	gcc-plugin/examples/Makefile
			
 
				 ])
			
 
				 
			
 
				 AC_MSG_NOTICE([
			
--- a/doc/starpu.texi
+++ b/doc/starpu.texi
@@ -409,7 +409,7 @@ configuration:
 
				 * Setting flags for compiling and linking applications::  
			
 
				 * Running a basic StarPU application::  
			
 
				 * Kernel threads started by StarPU::
			
 
				-* Using accelerators::          
			
 
				+* Enabling OpenCL::
			
 
				 @end menu
			
 
				 
			
 
				 @node Setting flags for compiling and linking applications
			
@@ -462,14 +462,20 @@ installed. This step is done only once per user and per machine.
 
				 @node Kernel threads started by StarPU
			
 
				 @section Kernel threads started by StarPU
			
 
				 
			
 
				-TODO: StarPU starts one thread per CPU core and binds them there, uses one of
			
 
				-them per GPU. The application is not supposed to do computations in its own
			
 
				-threads. TODO: add a StarPU function to bind an application thread (e.g. the
			
 
				-main thread) to a dedicated core (and thus disable the corresponding StarPU CPU
			
 
				-worker).
			
 
				+StarPU automatically binds one thread per CPU core. It does not use
			
 
				+SMT/hyperthreading because kernels are usually already optimized for using a
			
 
				+full core, and using hyperthreading would make kernel calibration rather random.
			
 
				 
			
 
				-@node Using accelerators
			
 
				-@section Using accelerators
			
 
				+Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU
			
 
				+
			
 
				+While StarPU tasks are executing, the application is not supposed to do
			
 
				+computations in the threads it starts itself, tasks should be used instead.
			
 
				+
			
 
				+TODO: add a StarPU function to bind an application thread (e.g. the main thread)
			
 
				+to a dedicated core (and thus disable the corresponding StarPU CPU worker).
			
 
				+
			
 
				+@node Enabling OpenCL
			
 
				+@section Enabling OpenCL
			
 
				 
			
 
				 When both CUDA and OpenCL drivers are enabled, StarPU will launch an
			
 
				 OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
			
@@ -477,8 +483,26 @@ This design choice was necessary as OpenCL and CUDA can not run at the
 
				 same time on the same NVIDIA GPU, as there is currently no interoperability
			
 
				 between them.
			
 
				 
			
 
				-Details on how to specify devices running OpenCL and the ones running
			
 
				-CUDA are given in @ref{Enabling OpenCL}.
			
 
				+To enable OpenCL, you need either to disable CUDA when configuring StarPU:
			
 
				+
			
 
				+@example
			
 
				+% ./configure --disable-cuda
			
 
				+@end example
			
 
				+
			
 
				+or when running applications:
			
 
				+
			
 
				+@example
			
 
				+% STARPU_NCUDA=0 ./application
			
 
				+@end example
			
 
				+
			
 
				+OpenCL will automatically be started on any device not yet used by
			
 
				+CUDA. So on a machine running 4 GPUS, it is therefore possible to
			
 
				+enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
			
 
				+so:
			
 
				+
			
 
				+@example
			
 
				+% STARPU_NCUDA=2 ./application
			
 
				+@end example
			
 
				 
			
 
				 
			
 
				 @c ---------------------------------------------------------------------
			
@@ -1110,7 +1134,7 @@ or for example, by disabling CPU devices:
 
				 @end smallexample
			
 
				 
			
 
				 or by disabling CUDA devices (which may permit to enable the use of OpenCL,
			
 
				-see @ref{Using accelerators}):
			
 
				+see @ref{Enabling OpenCL}):
			
 
				 
			
 
				 @smallexample
			
 
				 % STARPU_NCUDA=0 ./vector_scal
			
@@ -2153,7 +2177,9 @@ very natural way by the means of asynchronous interactions between the
 
				 application and StarPU.  This is implemented in a separate libstarpumpi library
			
 
				 which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
			
 
				 @code{void *} buffers are replaced with @code{starpu_data_handle}s, and all
			
 
				-GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.
			
 
				+GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.  The user has to
			
 
				+use the usual @code{mpirun} command of the MPI implementation to start StarPU on
			
 
				+the different MPI nodes.
			
 
				 
			
 
				 @menu
			
 
				 * The API::                     
			
@@ -3909,11 +3935,13 @@ This partitions a vector into blocks of the same size.
 
				 
			
 
				 
			
 
				 @deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				-This partitions a vector into blocks of sizes given in @var{filter_arg_ptr}.
			
 
				+This partitions a vector into blocks of sizes given in the @var{filter_arg_ptr}
			
 
				+field of @var{f}, supposed to point on a @code{uint32_t*} array.
			
 
				 @end deftypefun
			
 
				 
			
 
				 @deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				-This partitions a vector into two blocks, the first block size being given in @var{filter_arg}.
			
 
				+This partitions a vector into two blocks, the first block size being given in
			
 
				+the @var{filter_arg} field of @var{f}.
			
 
				 @end deftypefun
			
 
				 
			
 
				 
			
@@ -4601,38 +4629,11 @@ This function synchronously deinitializes the CUBLAS library on every CUDA devic
 
				 @section OpenCL extensions
			
 
				 
			
 
				 @menu
			
 
				-* Enabling OpenCL::            Enabling OpenCL
			
 
				 * Compiling OpenCL kernels::   Compiling OpenCL kernels
			
 
				 * Loading OpenCL kernels::     Loading OpenCL kernels
			
 
				 * OpenCL statistics::          Collecting statistics from OpenCL
			
 
				 @end menu
			
 
				 
			
 
				-@node Enabling OpenCL
			
 
				-@subsection Enabling OpenCL
			
 
				-
			
 
				-On GPU devices which can run both CUDA and OpenCL, CUDA will be
			
 
				-enabled by default. To enable OpenCL, you need either to disable CUDA
			
 
				-when configuring StarPU:
			
 
				-
			
 
				-@example
			
 
				-% ./configure --disable-cuda
			
 
				-@end example
			
 
				-
			
 
				-or when running applications:
			
 
				-
			
 
				-@example
			
 
				-% STARPU_NCUDA=0 ./application
			
 
				-@end example
			
 
				-
			
 
				-OpenCL will automatically be started on any device not yet used by
			
 
				-CUDA. So on a machine running 4 GPUS, it is therefore possible to
			
 
				-enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
			
 
				-so:
			
 
				-
			
 
				-@example
			
 
				-% STARPU_NCUDA=2 ./application
			
 
				-@end example
			
 
				-
			
 
				 @node Compiling OpenCL kernels
			
 
				 @subsection Compiling OpenCL kernels
			
 
				 
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -112,6 +112,9 @@ noinst_HEADERS = 				\
 
				 	lu/xlu_kernels.h			\
			
 
				 	lu/float.h				\
			
 
				 	lu/double.h				\
			
 
				+	lu/complex_float.h			\
			
 
				+	lu/complex_double.h			\
			
 
				+	lu/blas_complex.h			\
			
 
				 	cholesky/cholesky.h			\
			
 
				 	common/blas_model.h			\
			
 
				 	common/blas.h				\
			
@@ -195,6 +198,14 @@ examplebin_PROGRAMS +=				\
 
				 	cg/cg
			
 
				 endif
			
 
				 
			
 
				+if MKL_BLAS_LIB
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	lu/lu_example_complex_float		\
			
 
				+	lu/lu_example_complex_double		\
			
 
				+	lu/lu_implicit_example_complex_float	\
			
 
				+	lu/lu_implicit_example_complex_double
			
 
				+endif
			
 
				+
			
 
				 if ATLAS_BLAS_LIB
			
 
				 examplebin_PROGRAMS +=				\
			
 
				 	spmv/dw_block_spmv
			
@@ -244,6 +255,14 @@ STARPU_EXAMPLES +=				\
 
				 	cg/cg
			
 
				 endif
			
 
				 
			
 
				+if MKL_BLAS_LIB
			
 
				+STARPU_EXAMPLES +=				\
			
 
				+	lu/lu_example_complex_float		\
			
 
				+	lu/lu_example_complex_double		\
			
 
				+	lu/lu_implicit_example_complex_float	\
			
 
				+	lu/lu_implicit_example_complex_double
			
 
				+endif
			
 
				+
			
 
				 if ATLAS_BLAS_LIB
			
 
				 STARPU_EXAMPLES +=				\
			
 
				 	spmv/dw_block_spmv
			
@@ -466,8 +485,56 @@ lu_lu_implicit_example_double_SOURCES =		\
 
				 
			
 
				 lu_lu_implicit_example_double_LDADD =		\
			
 
				 	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				+if MKL_BLAS_LIB
			
 
				+lu_lu_example_complex_float_SOURCES =		\
			
 
				+	lu/lu_example_complex_float.c		\
			
 
				+	lu/clu.c				\
			
 
				+	lu/clu_pivot.c				\
			
 
				+	lu/clu_kernels.c			\
			
 
				+	lu/blas_complex.c			\
			
 
				+	common/blas.c
			
 
				+
			
 
				+lu_lu_example_complex_float_LDADD =		\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				+lu_lu_implicit_example_complex_float_SOURCES =	\
			
 
				+	lu/lu_example_complex_float.c		\
			
 
				+	lu/clu_implicit.c			\
			
 
				+	lu/clu_implicit_pivot.c			\
			
 
				+	lu/clu_kernels.c			\
			
 
				+	lu/blas_complex.c			\
			
 
				+	common/blas.c
			
 
				+
			
 
				+lu_lu_implicit_example_complex_float_LDADD =	\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				+lu_lu_example_complex_double_SOURCES =		\
			
 
				+	lu/lu_example_complex_double.c		\
			
 
				+	lu/zlu.c				\
			
 
				+	lu/zlu_pivot.c				\
			
 
				+	lu/zlu_kernels.c			\
			
 
				+	lu/blas_complex.c			\
			
 
				+	common/blas.c
			
 
				+
			
 
				+lu_lu_example_complex_double_LDADD =		\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				+lu_lu_implicit_example_complex_double_SOURCES =	\
			
 
				+	lu/lu_example_complex_double.c		\
			
 
				+	lu/zlu_implicit.c			\
			
 
				+	lu/zlu_implicit_pivot.c			\
			
 
				+	lu/zlu_kernels.c			\
			
 
				+	lu/blas_complex.c			\
			
 
				+	common/blas.c
			
 
				+
			
 
				+lu_lu_implicit_example_complex_double_LDADD =	\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				+endif
			
 
				 endif
			
 
				 
			
 
				+
			
 
				 ###########################
			
 
				 # 2 Cholesky in 2 ctxs  example #
			
 
				 ###########################
			
--- a/examples/basic_examples/block.c
+++ b/examples/basic_examples/block.c
@@ -122,5 +122,5 @@ int main(int argc, char **argv)
 
				 
			
 
				         starpu_shutdown();
			
 
				 
			
 
				-	return 0;
			
 
				+	return (ret!=1);
			
 
				 }
			
--- a/examples/lu/blas_complex.c
+++ b/examples/lu/blas_complex.c
@@ -0,0 +1,214 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <ctype.h>
			
 
				+#include <stdio.h>
			
 
				+#include <complex.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include "blas_complex.h"
			
 
				+
			
 
				+/*
			
 
				+    This files contains BLAS wrappers for the different BLAS implementations
			
 
				+  (eg. REFBLAS, STARPU_ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
			
 
				+  libraries do not supply C-based ordering.
			
 
				+ */
			
 
				+
			
 
				+#ifdef STARPU_ATLAS
			
 
				+#error not implemented
			
 
				+#elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
			
 
				+#error not implemented
			
 
				+#elif defined(STARPU_MKL)
			
 
				+
			
 
				+inline void CGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+			complex float alpha, complex float *A, int lda, complex float *B, int ldb, 
			
 
				+			complex float beta, complex float *C, int ldc)
			
 
				+{
			
 
				+	cgemm_(transa, transb, &M, &N, &K, &alpha,
			
 
				+			 A, &lda, B, &ldb,
			
 
				+			 &beta, C, &ldc);	
			
 
				+}
			
 
				+
			
 
				+inline void ZGEMM(char *transa, char *transb, int M, int N, int K, 
			
 
				+			complex double alpha, complex double *A, int lda, complex double *B, int ldb, 
			
 
				+			complex double beta, complex double *C, int ldc)
			
 
				+{
			
 
				+	zgemm_(transa, transb, &M, &N, &K, &alpha,
			
 
				+			 A, &lda, B, &ldb,
			
 
				+			 &beta, C, &ldc);	
			
 
				+}
			
 
				+
			
 
				+
			
 
				+inline void CGEMV(char *transa, int M, int N, complex float alpha, complex float *A, int lda,
			
 
				+		complex float *X, int incX, complex float beta, complex float *Y, int incY)
			
 
				+{
			
 
				+	cgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
			
 
				+}
			
 
				+
			
 
				+inline void ZGEMV(char *transa, int M, int N, complex double alpha, complex double *A, int lda,
			
 
				+		complex double *X, int incX, complex double beta, complex double *Y, int incY)
			
 
				+{
			
 
				+	zgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
			
 
				+}
			
 
				+
			
 
				+inline float SCASUM(int N, complex float *X, int incX)
			
 
				+{
			
 
				+	return scasum_(&N, X, &incX);
			
 
				+}
			
 
				+
			
 
				+inline double DZASUM(int N, complex double *X, int incX)
			
 
				+{
			
 
				+	return dzasum_(&N, X, &incX);
			
 
				+}
			
 
				+
			
 
				+void CSCAL(int N, complex float alpha, complex float *X, int incX)
			
 
				+{
			
 
				+	cscal_(&N, &alpha, X, &incX);
			
 
				+}
			
 
				+
			
 
				+void ZSCAL(int N, complex double alpha, complex double *X, int incX)
			
 
				+{
			
 
				+	zscal_(&N, &alpha, X, &incX);
			
 
				+}
			
 
				+
			
 
				+void CTRSM (const char *side, const char *uplo, const char *transa,
			
 
				+                   const char *diag, const int m, const int n,
			
 
				+                   const complex float alpha, const complex float *A, const int lda,
			
 
				+                   complex float *B, const int ldb)
			
 
				+{
			
 
				+	ctrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
			
 
				+}
			
 
				+
			
 
				+void ZTRSM (const char *side, const char *uplo, const char *transa,
			
 
				+                   const char *diag, const int m, const int n,
			
 
				+                   const complex double alpha, const complex double *A, const int lda,
			
 
				+                   complex double *B, const int ldb)
			
 
				+{
			
 
				+	ztrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
			
 
				+}
			
 
				+
			
 
				+void CSYR (const char *uplo, const int n, const complex float alpha,
			
 
				+                  const complex float *x, const int incx, complex float *A, const int lda)
			
 
				+{
			
 
				+	csyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
			
 
				+}
			
 
				+
			
 
				+void CSYRK (const char *uplo, const char *trans, const int n,
			
 
				+                   const int k, const complex float alpha, const complex float *A,
			
 
				+                   const int lda, const complex float beta, complex float *C,
			
 
				+                   const int ldc)
			
 
				+{
			
 
				+	csyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
			
 
				+}
			
 
				+
			
 
				+void CGERU(const int m, const int n, const complex float alpha,
			
 
				+                  const complex float *x, const int incx, const complex float *y,
			
 
				+                  const int incy, complex float *A, const int lda)
			
 
				+{
			
 
				+	cgeru_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
			
 
				+}
			
 
				+
			
 
				+void ZGERU(const int m, const int n, const complex double alpha,
			
 
				+                  const complex double *x, const int incx, const complex double *y,
			
 
				+                  const int incy, complex double *A, const int lda)
			
 
				+{
			
 
				+	zgeru_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
			
 
				+}
			
 
				+
			
 
				+void CTRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				+                   const int n, const complex float *A, const int lda, complex float *x, 
			
 
				+                   const int incx)
			
 
				+{
			
 
				+	ctrsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
			
 
				+}
			
 
				+
			
 
				+void CTRMM(const char *side, const char *uplo, const char *transA,
			
 
				+                 const char *diag, const int m, const int n,
			
 
				+                 const complex float alpha, const complex float *A, const int lda,
			
 
				+                 complex float *B, const int ldb)
			
 
				+{
			
 
				+	ctrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
			
 
				+}
			
 
				+
			
 
				+void ZTRMM(const char *side, const char *uplo, const char *transA,
			
 
				+                 const char *diag, const int m, const int n,
			
 
				+                 const complex double alpha, const complex double *A, const int lda,
			
 
				+                 complex double *B, const int ldb)
			
 
				+{
			
 
				+	ztrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
			
 
				+}
			
 
				+
			
 
				+void CTRMV(const char *uplo, const char *transA, const char *diag,
			
 
				+                 const int n, const complex float *A, const int lda, complex float *X,
			
 
				+                 const int incX)
			
 
				+{
			
 
				+	ctrmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
			
 
				+}
			
 
				+
			
 
				+void CAXPY(const int n, const complex float alpha, complex float *X, const int incX, complex float *Y, const int incY)
			
 
				+{
			
 
				+	caxpy_(&n, &alpha, X, &incX, Y, &incY);
			
 
				+}
			
 
				+
			
 
				+void ZAXPY(const int n, const complex double alpha, complex double *X, const int incX, complex double *Y, const int incY)
			
 
				+{
			
 
				+	zaxpy_(&n, &alpha, X, &incX, Y, &incY);
			
 
				+}
			
 
				+
			
 
				+int ICAMAX (const int n, complex float *X, const int incX)
			
 
				+{
			
 
				+    int retVal;
			
 
				+    retVal = icamax_ (&n, X, &incX);
			
 
				+    return retVal;
			
 
				+}
			
 
				+
			
 
				+int IZAMAX (const int n, complex double *X, const int incX)
			
 
				+{
			
 
				+    int retVal;
			
 
				+    retVal = izamax_ (&n, X, &incX);
			
 
				+    return retVal;
			
 
				+}
			
 
				+
			
 
				+complex float CDOTU(const int n, const complex float *x, const int incx, const complex float *y, const int incy)
			
 
				+{
			
 
				+	complex float retVal = 0;
			
 
				+
			
 
				+	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
			
 
				+	retVal = (float)cdotu_(&n, x, &incx, y, &incy);
			
 
				+
			
 
				+	return retVal;
			
 
				+}
			
 
				+
			
 
				+complex double ZDOTU(const int n, const complex double *x, const int incx, const complex double *y, const int incy)
			
 
				+{
			
 
				+	return zdotu_(&n, x, &incx, y, &incy);
			
 
				+}
			
 
				+
			
 
				+void CSWAP(const int n, complex float *X, const int incX, complex float *Y, const int incY)
			
 
				+{
			
 
				+	cswap_(&n, X, &incX, Y, &incY);
			
 
				+}
			
 
				+
			
 
				+void ZSWAP(const int n, complex double *X, const int incX, complex double *Y, const int incY)
			
 
				+{
			
 
				+	zswap_(&n, X, &incX, Y, &incY);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+#else
			
 
				+#error "no BLAS lib available..."
			
 
				+#endif
			
--- a/examples/lu/blas_complex.h
+++ b/examples/lu/blas_complex.h
@@ -0,0 +1,156 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __BLAS_H__
			
 
				+#define __BLAS_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#if defined(STARPU_MKL)
			
 
				+#define MKLcomplex8 complex float
			
 
				+#define MKLcomplex16 complex double
			
 
				+#endif
			
 
				+
			
 
				+void CGEMM(char *transa, char *transb, int M, int N, int K, complex float alpha, complex float *A, int lda, 
			
 
				+		complex float *B, int ldb, complex float beta, complex float *C, int ldc);
			
 
				+void ZGEMM(char *transa, char *transb, int M, int N, int K, complex double alpha, complex double *A, int lda, 
			
 
				+		complex double *B, int ldb, complex double beta, complex double *C, int ldc);
			
 
				+void CGEMV(char *transa, int M, int N, complex float alpha, complex float *A, int lda,
			
 
				+		complex float *X, int incX, complex float beta, complex float *Y, int incY);
			
 
				+void ZGEMV(char *transa, int M, int N, complex double alpha, complex double *A, int lda,
			
 
				+		complex double *X, int incX, complex double beta, complex double *Y, int incY);
			
 
				+float SCASUM(int N, complex float *X, int incX);
			
 
				+double DZASUM(int N, complex double *X, int incX);
			
 
				+void CSCAL(int N, complex float alpha, complex float *X, int incX);
			
 
				+void ZSCAL(int N, complex double alpha, complex double *X, int incX);
			
 
				+void CTRSM (const char *side, const char *uplo, const char *transa,
			
 
				+                   const char *diag, const int m, const int n,
			
 
				+                   const complex float alpha, const complex float *A, const int lda,
			
 
				+                   complex float *B, const int ldb);
			
 
				+void ZTRSM (const char *side, const char *uplo, const char *transa,
			
 
				+                   const char *diag, const int m, const int n,
			
 
				+                   const complex double alpha, const complex double *A, const int lda,
			
 
				+                   complex double *B, const int ldb);
			
 
				+void CSYR (const char *uplo, const int n, const complex float alpha,
			
 
				+                  const complex float *x, const int incx, complex float *A, const int lda);
			
 
				+void CSYRK (const char *uplo, const char *trans, const int n,
			
 
				+                   const int k, const complex float alpha, const complex float *A,
			
 
				+                   const int lda, const complex float beta, complex float *C,
			
 
				+                   const int ldc);
			
 
				+void CGERU (const int m, const int n, const complex float alpha,
			
 
				+                  const complex float *x, const int incx, const complex float *y,
			
 
				+                  const int incy, complex float *A, const int lda);
			
 
				+void ZGERU(const int m, const int n, const complex double alpha,
			
 
				+                  const complex double *x, const int incx, const complex double *y,
			
 
				+                  const int incy, complex double *A, const int lda);
			
 
				+void CTRSV (const char *uplo, const char *trans, const char *diag, 
			
 
				+                   const int n, const complex float *A, const int lda, complex float *x, 
			
 
				+                   const int incx);
			
 
				+void CTRMM(const char *side, const char *uplo, const char *transA,
			
 
				+                 const char *diag, const int m, const int n,
			
 
				+                 const complex float alpha, const complex float *A, const int lda,
			
 
				+                 complex float *B, const int ldb);
			
 
				+void ZTRMM(const char *side, const char *uplo, const char *transA,
			
 
				+                 const char *diag, const int m, const int n,
			
 
				+                 const complex double alpha, const complex double *A, const int lda,
			
 
				+                 complex double *B, const int ldb);
			
 
				+void CTRMV(const char *uplo, const char *transA, const char *diag,
			
 
				+                 const int n, const complex float *A, const int lda, complex float *X,
			
 
				+                 const int incX);
			
 
				+void CAXPY(const int n, const complex float alpha, complex float *X, const int incX, complex float *Y, const int incy);
			
 
				+void ZAXPY(const int n, const complex double alpha, complex double *X, const int incX, complex double *Y, const int incY);
			
 
				+int ICAMAX (const int n, complex float *X, const int incX);
			
 
				+int IZAMAX (const int n, complex double *X, const int incX);
			
 
				+complex float CDOTU(const int n, const complex float *x, const int incx, const complex float *y, const int incy);
			
 
				+complex double ZDOTU(const int n, const complex double *x, const int incx, const complex double *y, const int incy);
			
 
				+void CSWAP(const int n, complex float *x, const int incx, complex float *y, const int incy);
			
 
				+void ZSWAP(const int n, complex double *x, const int incx, complex double *y, const int incy);
			
 
				+
			
 
				+#if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
			
 
				+#error not implemented
			
 
				+#elif defined(STARPU_MKL)
			
 
				+
			
 
				+extern void cgemm_ (const char *transa, const char *transb, const int *m,
			
 
				+                   const int *n, const int *k, const complex float *alpha, 
			
 
				+                   const complex float *A, const int *lda, const complex float *B, 
			
 
				+                   const int *ldb, const complex float *beta, complex float *C, 
			
 
				+                   const int *ldc);
			
 
				+extern void zgemm_ (const char *transa, const char *transb, const int *m,
			
 
				+                   const int *n, const int *k, const complex double *alpha, 
			
 
				+                   const complex double *A, const int *lda, const complex double *B, 
			
 
				+                   const int *ldb, const complex double *beta, complex double *C, 
			
 
				+                   const int *ldc);
			
 
				+extern void cgemv_(const char *trans, int *m, int *n, complex float *alpha,
			
 
				+                   void *a, int *lda, void *x, int *incx, 
			
 
				+                   complex float *beta, void *y, int *incy);
			
 
				+extern void zgemv_(const char *trans, int *m, int *n, complex double *alpha,
			
 
				+                   void *a, int *lda, void *x, int *incx,
			
 
				+                   complex double *beta, void *y, int *incy);
			
 
				+extern void csyr_ (const char *uplo, const int *n, const complex float *alpha,
			
 
				+                  const complex float *x, const int *incx, complex float *A, const int *lda);
			
 
				+extern void csyrk_ (const char *uplo, const char *trans, const int *n,
			
 
				+                   const int *k, const complex float *alpha, const complex float *A,
			
 
				+                   const int *lda, const complex float *beta, complex float *C,
			
 
				+                   const int *ldc);
			
 
				+extern void ctrsm_ (const char *side, const char *uplo, const char *transa, 
			
 
				+                   const char *diag, const int *m, const int *n,
			
 
				+                   const complex float *alpha, const complex float *A, const int *lda,
			
 
				+                   complex float *B, const int *ldb);
			
 
				+extern void ztrsm_ (const char *side, const char *uplo, const char *transa, 
			
 
				+                   const char *diag, const int *m, const int *n,
			
 
				+                   const complex double *alpha, const complex double *A, const int *lda,
			
 
				+                   complex double *B, const int *ldb);
			
 
				+extern complex double scasum_ (const int *n, const complex float *x, const int *incx);
			
 
				+extern complex double dzasum_ (const int *n, const complex double *x, const int *incx);
			
 
				+extern void cscal_ (const int *n, const complex float *alpha, complex float *x,
			
 
				+                   const int *incx);
			
 
				+extern void zscal_ (const int *n, const complex double *alpha, complex double *x,
			
 
				+                   const int *incx);
			
 
				+extern void cgeru_(const int *m, const int *n, const complex float *alpha,
			
 
				+                  const complex float *x, const int *incx, const complex float *y,
			
 
				+                  const int *incy, complex float *A, const int *lda);
			
 
				+extern void zgeru_(const int *m, const int *n, const complex double *alpha,
			
 
				+                  const complex double *x, const int *incx, const complex double *y,
			
 
				+                  const int *incy, complex double *A, const int *lda);
			
 
				+extern void ctrsv_ (const char *uplo, const char *trans, const char *diag, 
			
 
				+                   const int *n, const complex float *A, const int *lda, complex float *x, 
			
 
				+                   const int *incx);
			
 
				+extern void ctrmm_(const char *side, const char *uplo, const char *transA,
			
 
				+                 const char *diag, const int *m, const int *n,
			
 
				+                 const complex float *alpha, const complex float *A, const int *lda,
			
 
				+                 complex float *B, const int *ldb);
			
 
				+extern void ztrmm_(const char *side, const char *uplo, const char *transA,
			
 
				+                 const char *diag, const int *m, const int *n,
			
 
				+                 const complex double *alpha, const complex double *A, const int *lda,
			
 
				+                 complex double *B, const int *ldb);
			
 
				+extern void ctrmv_(const char *uplo, const char *transA, const char *diag,
			
 
				+                 const int *n, const complex float *A, const int *lda, complex float *X,
			
 
				+                 const int *incX);
			
 
				+extern void caxpy_(const int *n, const complex float *alpha, complex float *X, const int *incX,
			
 
				+		complex float *Y, const int *incy);
			
 
				+extern void zaxpy_(const int *n, const complex double *alpha, complex double *X, const int *incX,
			
 
				+		complex double *Y, const int *incy);
			
 
				+extern int icamax_(const int *n, complex float *X, const int *incX);
			
 
				+extern int izamax_(const int *n, complex double *X, const int *incX);
			
 
				+/* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
			
 
				+extern complex double cdotu_(const int *n, const complex float *x, const int *incx, const complex float *y, const int *incy);
			
 
				+extern complex double zdotu_(const int *n, const complex double *x, const int *incx, const complex double *y, const int *incy);
			
 
				+extern void cswap_(const int *n, complex float *x, const int *incx, complex float *y, const int *incy);
			
 
				+extern void zswap_(const int *n, complex double *x, const int *incx, complex double *y, const int *incy);
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+#endif /* __BLAS_COMPLEX_H__ */
			
--- a/examples/lu/clu.c
+++ b/examples/lu/clu.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_float.h"
			
 
				+#include "xlu.c"
			
--- a/examples/lu/clu_implicit.c
+++ b/examples/lu/clu_implicit.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_float.h"
			
 
				+#include "xlu_implicit.c"
			
--- a/examples/lu/clu_implicit_pivot.c
+++ b/examples/lu/clu_implicit_pivot.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_float.h"
			
 
				+#include "xlu_implicit_pivot.c"
			
--- a/examples/lu/clu_kernels.c
+++ b/examples/lu/clu_kernels.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_float.h"
			
 
				+#include "xlu_kernels.c"
			
--- a/examples/lu/clu_pivot.c
+++ b/examples/lu/clu_pivot.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_float.h"
			
 
				+#include "xlu_pivot.c"
			
--- a/examples/lu/complex_double.h
+++ b/examples/lu/complex_double.h
@@ -0,0 +1,52 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <complex.h>
			
 
				+#include "blas_complex.h"
			
 
				+#define TYPE complex double
			
 
				+#define CUBLAS_TYPE cuDoubleComplex
			
 
				+
			
 
				+#define STARPU_LU(name)       starpu_zlu_##name
			
 
				+#define COMPLEX_LU
			
 
				+
			
 
				+#ifdef STARPU_HAVE_MAGMA
			
 
				+#include <magmablas.h>
			
 
				+#define CUBLAS_GEMM	magmablas_zgemm
			
 
				+#define CUBLAS_TRSM	magmablas_ztrsm
			
 
				+#else
			
 
				+#define CUBLAS_GEMM	cublasZgemm
			
 
				+#define CUBLAS_TRSM	cublasZtrsm
			
 
				+#endif
			
 
				+
			
 
				+#define CUBLAS_SCAL	cublasZscal
			
 
				+#define CUBLAS_GER	cublasZgeru
			
 
				+#define CUBLAS_SWAP	cublasZswap
			
 
				+#define CUBLAS_IAMAX	cublasIzamax
			
 
				+
			
 
				+#define CPU_GEMM	ZGEMM
			
 
				+#define CPU_TRSM	ZTRSM
			
 
				+#define CPU_SCAL	ZSCAL
			
 
				+#define CPU_GER		ZGERU
			
 
				+#define CPU_SWAP	ZSWAP
			
 
				+
			
 
				+#define CPU_TRMM	ZTRMM
			
 
				+#define CPU_AXPY	ZAXPY
			
 
				+#define CPU_ASUM	DZASUM
			
 
				+#define CPU_IAMAX	IZAMAX
			
 
				+
			
 
				+#define PIVOT_THRESHHOLD	10e-5
			
--- a/examples/lu/complex_float.h
+++ b/examples/lu/complex_float.h
@@ -0,0 +1,52 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+#include <complex.h>
			
 
				+#include "blas_complex.h"
			
 
				+#define TYPE complex float
			
 
				+#define CUBLAS_TYPE cuComplex
			
 
				+
			
 
				+#define STARPU_LU(name)       starpu_clu_##name
			
 
				+#define COMPLEX_LU
			
 
				+
			
 
				+#ifdef STARPU_HAVE_MAGMA
			
 
				+#include <magmablas.h>
			
 
				+#define CUBLAS_GEMM	magmablas_cgemm
			
 
				+#define CUBLAS_TRSM	magmablas_ctrsm
			
 
				+#else
			
 
				+#define CUBLAS_GEMM	cublasCgemm
			
 
				+#define CUBLAS_TRSM	cublasCtrsm
			
 
				+#endif
			
 
				+
			
 
				+#define CUBLAS_SCAL	cublasCscal
			
 
				+#define CUBLAS_GER	cublasCgeru
			
 
				+#define CUBLAS_SWAP	cublasCswap
			
 
				+#define CUBLAS_IAMAX	cublasIcamax
			
 
				+
			
 
				+#define CPU_GEMM	CGEMM
			
 
				+#define CPU_TRSM	CTRSM
			
 
				+#define CPU_SCAL	CSCAL
			
 
				+#define CPU_GER		CGERU
			
 
				+#define CPU_SWAP	CSWAP
			
 
				+
			
 
				+#define CPU_TRMM	CTRMM
			
 
				+#define CPU_AXPY	CAXPY
			
 
				+#define CPU_ASUM	SCASUM
			
 
				+#define CPU_IAMAX	ICAMAX
			
 
				+
			
 
				+#define PIVOT_THRESHHOLD	10e-5
			
--- a/examples/lu/double.h
+++ b/examples/lu/double.h
@@ -16,6 +16,7 @@
 
				  */
			
 
				 
			
 
				 #define TYPE double
			
 
				+#define CUBLAS_TYPE TYPE
			
 
				 
			
 
				 #define STARPU_LU(name)       starpu_dlu_##name
			
 
				 
			
--- a/examples/lu/float.h
+++ b/examples/lu/float.h
@@ -17,6 +17,7 @@
 
				 
			
 
				 
			
 
				 #define TYPE float
			
 
				+#define CUBLAS_TYPE TYPE
			
 
				 
			
 
				 #define STARPU_LU(name)       starpu_slu_##name
			
 
				 
			
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -164,6 +164,10 @@ static void init_matrix(void)
 
				 		for (i = 0; i < size; i++)
			
 
				 		{
			
 
				 			A[i + j*size] = (TYPE)starpu_drand48();
			
 
				+#ifdef COMPLEX_LU
			
 
				+			/* also randomize the imaginary component for complex number cases */
			
 
				+			A[i + j*size] += (TYPE)(I*starpu_drand48());
			
 
				+#endif
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -249,11 +253,20 @@ static void check_result(void)
 
				 	CPU_AXPY(size*size, -1.0, A_saved, 1, L, 1);
			
 
				 	display_matrix(L, size, size, "Residuals");
			
 
				 	
			
 
				+#ifdef COMPLEX_LU
			
 
				+	double err = CPU_ASUM(size*size, L, 1);
			
 
				+	int max = CPU_IAMAX(size*size, L, 1);
			
 
				+	TYPE l_max = L[max];
			
 
				+
			
 
				+	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
			
 
				+	FPRINTF(stderr, "Max error : %e\n", sqrt(creal(l_max)*creal(l_max)+cimag(l_max)*cimag(l_max)));
			
 
				+#else
			
 
				 	TYPE err = CPU_ASUM(size*size, L, 1);
			
 
				 	int max = CPU_IAMAX(size*size, L, 1);
			
 
				 
			
 
				 	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
			
 
				 	FPRINTF(stderr, "Max error : %e\n", L[max]);
			
 
				+#endif
			
 
				 
			
 
				 	double residual = frobenius_norm(L, size);
			
 
				 	double matnorm = frobenius_norm(A_saved, size);
			
--- a/examples/lu/lu_example_complex_double.c
+++ b/examples/lu/lu_example_complex_double.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_double.h"
			
 
				+#include "lu_example.c"
			
--- a/examples/lu/lu_example_complex_float.c
+++ b/examples/lu/lu_example_complex_float.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_float.h"
			
 
				+#include "lu_example.c"
			
--- a/examples/lu/xlu_kernels.c
+++ b/examples/lu/xlu_kernels.c
@@ -22,6 +22,11 @@
 
				 #define xstr(s)        str(s)
			
 
				 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
			
 
				 
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+static const TYPE p1 =  1.0f;
			
 
				+static const TYPE m1 = -1.0f;
			
 
				+#endif
			
 
				+
			
 
				 /*
			
 
				  *   U22 
			
 
				  */
			
@@ -54,10 +59,10 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 
				 			break;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-		case 1:
			
 
				+		case 1: {
			
 
				 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
			
 
				-				(TYPE)-1.0, right, ld21, left, ld12,
			
 
				-				(TYPE)1.0f, center, ld22);
			
 
				+				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
			
 
				+				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
			
 
				 
			
 
				 			status = cublasGetError();
			
 
				 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
			
@@ -67,6 +72,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 
				 				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			break;
			
 
				+		}
			
 
				 #endif
			
 
				 		default:
			
 
				 			STARPU_ABORT();
			
@@ -140,7 +146,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case 1:
			
 
				 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
			
 
				-					(TYPE)1.0, sub11, ld11, sub12, ld12);
			
 
				+					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
			
 
				 
			
 
				 			status = cublasGetError();
			
 
				 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
			
@@ -221,7 +227,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		case 1:
			
 
				 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
			
 
				-					(TYPE)1.0, sub11, ld11, sub21, ld21);
			
 
				+					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
			
 
				 
			
 
				 			status = cublasGetError();
			
 
				 			if (status != CUBLAS_STATUS_SUCCESS)
			
@@ -307,17 +313,19 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 
				 			for (z = 0; z < nx; z++)
			
 
				 			{
			
 
				 				TYPE pivot;
			
 
				+				TYPE inv_pivot;
			
 
				 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
			
 
				 				cudaStreamSynchronize(0);
			
 
				 
			
 
				 				STARPU_ASSERT(pivot != 0.0);
			
 
				 				
			
 
				-				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
			
 
				+				inv_pivot = 1.0/pivot;
			
 
				+				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
			
 
				 				
			
 
				-				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
			
 
				-						&sub11[(z+1)+z*ld], 1,
			
 
				-						&sub11[z+(z+1)*ld], ld,
			
 
				-						&sub11[(z+1) + (z+1)*ld],ld);
			
 
				+				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
			
 
				+						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
			
 
				+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
			
 
				+						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
			
 
				 			}
			
 
				 			
			
 
				 			cudaThreadSynchronize();
			
@@ -423,20 +431,21 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
				 			for (z = 0; z < nx; z++)
			
 
				 			{
			
 
				 				TYPE pivot;
			
 
				+				TYPE inv_pivot;
			
 
				 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
			
 
				 				cudaStreamSynchronize(0);
			
 
				 
			
 
				 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
			
 
				 				{
			
 
				 					/* find the pivot */
			
 
				-					int piv_ind = CUBLAS_IAMAX(nx - z, &sub11[z*(ld+1)], ld) - 1;
			
 
				+					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
			
 
				 	
			
 
				 					ipiv[z + first] = piv_ind + z + first;
			
 
				 
			
 
				 					/* swap if needed */
			
 
				 					if (piv_ind != 0)
			
 
				 					{
			
 
				-						CUBLAS_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
			
 
				+						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
			
 
				 					}
			
 
				 
			
 
				 					cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
			
@@ -445,12 +454,13 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
				 
			
 
				 				STARPU_ASSERT(pivot != 0.0);
			
 
				 				
			
 
				-				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
			
 
				+				inv_pivot = 1.0/pivot;
			
 
				+				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
			
 
				 				
			
 
				-				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
			
 
				-						&sub11[(z+1)+z*ld], 1,
			
 
				-						&sub11[z+(z+1)*ld], ld,
			
 
				-						&sub11[(z+1) + (z+1)*ld],ld);
			
 
				+				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
			
 
				+						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
			
 
				+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
			
 
				+						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
			
 
				 				
			
 
				 			}
			
 
				 
			
@@ -534,7 +544,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 
				 				unsigned rowpiv = ipiv[row+first] - first;
			
 
				 				if (rowpiv != row)
			
 
				 				{
			
 
				-					CUBLAS_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
			
 
				+					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
			
 
				 				}
			
 
				 			}
			
 
				 
			
--- a/examples/lu/zlu.c
+++ b/examples/lu/zlu.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_double.h"
			
 
				+#include "xlu.c"
			
--- a/examples/lu/zlu_implicit.c
+++ b/examples/lu/zlu_implicit.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_double.h"
			
 
				+#include "xlu_implicit.c"
			
--- a/examples/lu/zlu_implicit_pivot.c
+++ b/examples/lu/zlu_implicit_pivot.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_double.h"
			
 
				+#include "xlu_implicit_pivot.c"
			
--- a/examples/lu/zlu_kernels.c
+++ b/examples/lu/zlu_kernels.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_double.h"
			
 
				+#include "xlu_kernels.c"
			
--- a/examples/lu/zlu_pivot.c
+++ b/examples/lu/zlu_pivot.c
@@ -0,0 +1,19 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "complex_double.h"
			
 
				+#include "xlu_pivot.c"
			
--- a/gcc-plugin/examples/Makefile.am
+++ b/gcc-plugin/examples/Makefile.am
@@ -14,12 +14,30 @@
 
				 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				 
			
 
				 noinst_PROGRAMS =				\
			
 
				-  matrix-mult
			
 
				+  matrix-mult stencil5
			
 
				+
			
 
				+if !NO_BLAS_LIB
			
 
				+noinst_PROGRAMS +=				\
			
 
				+  cholesky/cholesky
			
 
				+endif
			
 
				 
			
 
				 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
			
 
				 
			
 
				 AM_CPPFLAGS =						\
			
 
				   -I$(top_srcdir)/include				\
			
 
				+  -I$(top_srcdir)/examples				\
			
 
				   $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
			
 
				 
			
 
				 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
			
 
				+
			
 
				+if !NO_BLAS_LIB
			
 
				+cholesky_cholesky_SOURCES	=		\
			
 
				+	cholesky/cholesky.c		\
			
 
				+	cholesky/cholesky_models.c	\
			
 
				+	cholesky/cholesky_kernels.c	\
			
 
				+	$(top_srcdir)/examples/common/blas.c
			
 
				+
			
 
				+cholesky_cholesky_LDADD	=	\
			
 
				+	$(STARPU_BLAS_LDFLAGS)
			
 
				+endif
			
 
				+
			
--- a/gcc-plugin/examples/stencil5.c
+++ b/gcc-plugin/examples/stencil5.c
@@ -0,0 +1,127 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <stdlib.h>
			
 
				+#include <math.h>
			
 
				+
			
 
				+#ifndef STARPU_GCC_PLUGIN
			
 
				+# error must be compiled with the StarPU GCC plug-in
			
 
				+#endif
			
 
				+
			
 
				+/* Definition of the StarPU task and its CPU implementation.  */
			
 
				+static void stencil5(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
			
 
				+	__attribute__ ((task));
			
 
				+
			
 
				+static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
			
 
				+	__attribute__ ((task_implementation ("cpu", stencil5)));
			
 
				+
			
 
				+static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
			
 
				+{
			
 
				+	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
			
 
				+}
			
 
				+
			
 
				+#define NITER_DEF 20000
			
 
				+#define X         10
			
 
				+#define Y         10
			
 
				+
			
 
				+int display = 0;
			
 
				+int niter = NITER_DEF;
			
 
				+
			
 
				+static void parse_args(int argc, char **argv)
			
 
				+{
			
 
				+	int i;
			
 
				+	for (i = 1; i < argc; i++) {
			
 
				+		if (strcmp(argv[i], "-iter") == 0) {
			
 
				+			char *argptr;
			
 
				+			niter = strtol(argv[++i], &argptr, 10);
			
 
				+		}
			
 
				+		if (strcmp(argv[i], "-display") == 0) {
			
 
				+			display = 1;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static float my_rand (void)
			
 
				+{
			
 
				+	return (float) rand () / (float) RAND_MAX;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+        int x, y;
			
 
				+        float mean=0;
			
 
				+        float matrix[X][Y];
			
 
				+
			
 
				+        parse_args(argc, argv);
			
 
				+
			
 
				+	srand (time (NULL));
			
 
				+        for(x = 0; x < X; x++) {
			
 
				+                for (y = 0; y < Y; y++) {
			
 
				+                        matrix[x][y] = my_rand () * 100;
			
 
				+                        mean += matrix[x][y];
			
 
				+                }
			
 
				+        }
			
 
				+        mean /= (x*y);
			
 
				+
			
 
				+        if (display) {
			
 
				+                fprintf(stdout, "mean=%f\n", mean);
			
 
				+                for(x = 0; x < X; x++) {
			
 
				+                        for (y = 0; y < Y; y++) {
			
 
				+                                fprintf(stdout, "%3f ", matrix[x][y]);
			
 
				+                        }
			
 
				+                        fprintf(stdout, "\n");
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+#pragma starpu initialize
			
 
				+
			
 
				+        for(x = 0; x < X; x++) {
			
 
				+		for (y = 0; y < Y; y++) {
			
 
				+#pragma starpu register &matrix[x][y] 1
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	while(niter--) {
			
 
				+                for (x = 1; x < X-1; x++) {
			
 
				+                        for (y = 1; y < Y-1; y++) {
			
 
				+                                stencil5(&matrix[x][y], &matrix[x-1][y], &matrix[x+1][y],
			
 
				+					 &matrix[x][y-1], &matrix[x][y+1]);
			
 
				+                        }
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+#pragma starpu wait
			
 
				+
			
 
				+        for(x = 0; x < X; x++) {
			
 
				+                for (y = 0; y < Y; y++) {
			
 
				+#pragma starpu unregister &matrix[x][y]
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+#pragma starpu shutdown
			
 
				+
			
 
				+        if (display) {
			
 
				+                fprintf(stdout, "mean=%f\n", mean);
			
 
				+                for(x = 0; x < X; x++) {
			
 
				+                        for (y = 0; y < Y; y++) {
			
 
				+                                fprintf(stdout, "%3f ", matrix[x][y]);
			
 
				+                        }
			
 
				+                        fprintf(stdout, "\n");
			
 
				+                }
			
 
				+        }
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
--- a/gcc-plugin/src/starpu.c
+++ b/gcc-plugin/src/starpu.c
@@ -690,6 +690,12 @@ handle_task_attribute (tree *node, tree name, tree args,
 
				 	      "%<task%> attribute only applies to functions");
			
 
				   else
			
 
				     {
			
 
				+      if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (fn))))
			
 
				+	/* Raise an error but keep going to avoid spitting out too many
			
 
				+	   errors at the user's face.  */
			
 
				+	error_at (DECL_SOURCE_LOCATION (fn),
			
 
				+		  "task return type must be %<void%>");
			
 
				+
			
 
				       /* This is a function declaration for something local to this
			
 
				 	 translation unit, so add the `task' attribute to FN.  */
			
 
				       *no_add_attrs = false;
			
--- a/gcc-plugin/tests/run-test.in
+++ b/gcc-plugin/tests/run-test.in
@@ -261,10 +261,15 @@ unsatisfied directives."
 
				 
			
 
				 (define (executable-file source)
			
 
				   "Return the name of the executable file corresponding to SOURCE."
			
 
				-  (let ((dot (string-rindex source #\.)))
			
 
				-    (if dot
			
 
				-        (substring source 0 dot)
			
 
				-        (string-append source ".exe"))))
			
 
				+  (let* ((dot (string-rindex source #\.))
			
 
				+         (exe (if dot
			
 
				+                  (substring source 0 dot)
			
 
				+                  (string-append source ".exe")))
			
 
				+         )
			
 
				+  (if (string-prefix? %srcdir exe)
			
 
				+      (string-append %builddir (substring exe (string-length %srcdir)))
			
 
				+      exe
			
 
				+      )))
			
 
				 
			
 
				 (define (compile/match file cc cflags ldflags)
			
 
				   "Read directives from FILE, and compiler/link/run it.  Make sure directives
			
--- a/gcc-plugin/tests/task-errors.c
+++ b/gcc-plugin/tests/task-errors.c
@@ -49,6 +49,9 @@ static void my_task_wrong_target_arg (int foo, char *bar) /* (error "string cons
 
				 static void my_task_with_a_body (int foo, char *bar)
			
 
				   __attribute__ ((task, unused));
			
 
				 
			
 
				+extern int my_task_not_void (int foo) /* (error "return type") */
			
 
				+  __attribute__ ((task));
			
 
				+
			
 
				 
			
 
				 static void
			
 
				 my_task_cpu (int foo, float *bar)
			
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -78,15 +78,13 @@ struct starpu_sched_policy_s {
 
				 
			
 
				 	/* Insert a task into the scheduler. */
			
 
				         int (*push_task)(struct starpu_task *, unsigned);
			
 
				-	/* Notify the scheduler that a task was pushed on the worker. This
			
 
				-	 * method is called when a task that was explicitely assigned to a
			
 
				-	 * worker is scheduled. This method therefore permits to keep the state
			
 
				-	 * of of the scheduler coherent even when StarPU bypasses the
			
 
				-	 * scheduling strategy. */
			
 
				+	/* Notify the scheduler that a task was directly pushed to the worker
			
 
				+	 * without going through the scheduler. This method is called when a
			
 
				+	 * task is explicitely assigned to a worker. This method therefore
			
 
				+	 * permits to keep the timing state of the scheduler coherent even
			
 
				+	 * when StarPU bypasses the scheduling strategy. */
			
 
				 	void (*push_task_notify)(struct starpu_task *, int workerid, unsigned);
			
 
				 
			
 
				-	/* Insert a priority task into the scheduler. */
			
 
				-        int (*push_prio_task)(struct starpu_task *, unsigned);
			
 
				 
			
 
				 	/* Get a task from the scheduler. The mutex associated to the worker is
			
 
				 	 * already taken when this method is called. */
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -429,12 +429,12 @@ static void benchmark_all_gpu_devices(void)
 
				 #endif
			
 
				 
			
 
				 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				-	ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+	ncpus = config->topology.ncpus;
			
 
				 
			
 
				 	/* TODO: measure bandwidth between GPU-GPU */
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-        cudaGetDeviceCount(&ncuda);
			
 
				+	ncuda = _starpu_get_cuda_device_count();
			
 
				 	for (i = 0; i < ncuda; i++)
			
 
				 	{
			
 
				 		fprintf(stderr," CUDA %d...", i);
			
@@ -481,7 +481,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 
				 	char hostname[32];
			
 
				 	char *forced_hostname = getenv("STARPU_HOSTNAME");
			
 
				 	if (forced_hostname && forced_hostname[0])
			
 
				-		snprintf(hostname, sizeof(hostname), forced_hostname);
			
 
				+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
			
 
				 	else
			
 
				 		gethostname(hostname, sizeof(hostname));
			
 
				 	strncat(path, ".", maxlen);
			
@@ -509,11 +509,11 @@ static void load_bus_affinity_file_content(void)
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				-	ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+	ncpus = config->topology.ncpus;
			
 
				         int gpu;
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-        cudaGetDeviceCount(&ncuda);
			
 
				+	ncuda = _starpu_get_cuda_device_count();
			
 
				 	for (gpu = 0; gpu < ncuda; gpu++)
			
 
				 	{
			
 
				 		int ret;
			
@@ -1020,9 +1020,9 @@ static void check_bus_config_file()
 
				                 fclose(f);
			
 
				 
			
 
				                 // Loading current configuration
			
 
				-                ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+                ncpus = config->topology.ncpus;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-                cudaGetDeviceCount(&ncuda);
			
 
				+		ncuda = _starpu_get_cuda_device_count();
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				                 nopencl = _starpu_opencl_get_device_count();
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -204,17 +204,117 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
			
 
				+static void parse_arch(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history, unsigned archmin, unsigned archmax, int skiparch)
			
 
				 {
			
 
				-	unsigned arch;
			
 
				-	unsigned nimpl;
			
 
				-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++) {
			
 
				-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
			
 
				-			parse_per_arch_model_file(f, &model->per_arch[arch][nimpl], scan_history);
			
 
				+	unsigned arch, impl;
			
 
				+	struct starpu_per_arch_perfmodel_t dummy;
			
 
				+	int nimpls, implmax, skipimpl;
			
 
				+	unsigned ret;
			
 
				+	
			
 
				+
			
 
				+	for (arch = archmin; arch < archmax; arch++) {
			
 
				+		_starpu_drop_comments(f);
			
 
				+		ret = fscanf(f, "%d\n", &nimpls);
			
 
				+		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
			
 
				+		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
			
 
				+		for (impl = 0; impl < implmax; impl++) {
			
 
				+			parse_per_arch_model_file(f, &model->per_arch[arch][impl], scan_history);
			
 
				+		}
			
 
				+		if (skipimpl > 0) {
			
 
				+			for (impl = 0; impl < skipimpl; impl++) {
			
 
				+				parse_per_arch_model_file(f, &dummy, 0);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if (skiparch > 0) {
			
 
				+		_starpu_drop_comments(f);
			
 
				+		ret = fscanf(f, "%d\n", &nimpls);
			
 
				+		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
			
 
				+		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
			
 
				+		for (arch = 0; arch < skiparch; arch ++) {
			
 
				+			for (impl = 0; impl < implmax; impl++) {
			
 
				+				parse_per_arch_model_file(f, &dummy, 0);
			
 
				+			}
			
 
				+			if (skipimpl > 0) {
			
 
				+				for (impl = 0; impl < skipimpl; impl++) {
			
 
				+					parse_per_arch_model_file(f, &dummy, 0);
			
 
				+				}
			
 
				+			}
			
 
				 		}
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				+static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
			
 
				+{
			
 
				+	unsigned ret;
			
 
				+	unsigned archmin = 0;
			
 
				+	unsigned max_gordondevs = 1; /* XXX : we need a STARPU_MAXGORDONDEVS cst */
			
 
				+	unsigned narchs;
			
 
				+	int nimpls;
			
 
				+
			
 
				+	/* We could probably write a clean loop here, but the code would not
			
 
				+	 * really be easier to read. */
			
 
				+
			
 
				+	/* Parsing CPUs */
			
 
				+	_starpu_drop_comments(f);
			
 
				+	ret = fscanf(f, "%u\n", &narchs);
			
 
				+	STARPU_ASSERT(ret == 1);
			
 
				+
			
 
				+	_STARPU_DEBUG("Parsing %u CPUs\n", narchs);
			
 
				+	if (narchs > 0)
			
 
				+	{
			
 
				+		parse_arch(f, model, scan_history,
			
 
				+				archmin,
			
 
				+				STARPU_MIN(narchs, STARPU_MAXCPUS),
			
 
				+				narchs - STARPU_MAXCPUS);
			
 
				+	}
			
 
				+
			
 
				+	/* Parsing CUDA devs */
			
 
				+	_starpu_drop_comments(f);
			
 
				+	ret = fscanf(f, "%u\n", &narchs);
			
 
				+	STARPU_ASSERT(ret == 1);
			
 
				+	_STARPU_DEBUG("Parsing %u CUDA devices\n", narchs);
			
 
				+	if (narchs > 0)
			
 
				+	{
			
 
				+		archmin += STARPU_MAXCPUS;
			
 
				+		parse_arch(f, model, scan_history,
			
 
				+				archmin,
			
 
				+				archmin + STARPU_MIN(narchs, STARPU_MAXCUDADEVS),
			
 
				+				narchs - STARPU_MAXCUDADEVS);
			
 
				+	}
			
 
				+
			
 
				+	/* Parsing OpenCL devs */
			
 
				+	_starpu_drop_comments(f);
			
 
				+	ret = fscanf(f, "%u\n", &narchs);
			
 
				+	STARPU_ASSERT(ret == 1);
			
 
				+
			
 
				+	_STARPU_DEBUG("Parsing %u OpenCL devices\n", narchs);
			
 
				+	if (narchs > 0)
			
 
				+	{
			
 
				+		archmin += STARPU_MAXCUDADEVS;
			
 
				+		parse_arch(f, model, scan_history,
			
 
				+				archmin,
			
 
				+				archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
			
 
				+				narchs - STARPU_MAXOPENCLDEVS);
			
 
				+	}
			
 
				+
			
 
				+	/* Parsing Gordon implementations */
			
 
				+	_starpu_drop_comments(f);
			
 
				+	ret = fscanf(f, "%u\n", &narchs);
			
 
				+	STARPU_ASSERT(ret == 1);
			
 
				+
			
 
				+	_STARPU_DEBUG("Parsing %u Gordon devices\n", narchs);
			
 
				+	if (narchs > 0)
			
 
				+	{
			
 
				+		archmin += STARPU_MAXOPENCLDEVS;
			
 
				+		parse_arch(f, model, scan_history,
			
 
				+				archmin,
			
 
				+				archmin + max_gordondevs,
			
 
				+				narchs - max_gordondevs);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 
			
 
				 static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
			
 
				 {
			
@@ -235,7 +335,12 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	if (nentries == 0)
			
 
				+		return;
			
 
				 	/* header */
			
 
				+	char archname[32];
			
 
				+	starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
			
 
				+	fprintf(f, "# Model for %s\n", archname);
			
 
				 	fprintf(f, "# number of entries\n%u\n", nentries);
			
 
				 
			
 
				 	dump_reg_model(f, model, arch, nimpl);
			
@@ -250,23 +355,123 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 
				 			ptr = ptr->next;
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				+	fprintf(f, "\n##################\n");
			
 
				 }
			
 
				 
			
 
				-static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
			
 
				+static unsigned get_n_entries(struct starpu_perfmodel_t *model, unsigned arch, unsigned impl)
			
 
				 {
			
 
				-	fprintf(f, "#################\n");
			
 
				+	struct starpu_per_arch_perfmodel_t *per_arch_model;
			
 
				+	per_arch_model = &model->per_arch[arch][impl];
			
 
				+	/* count the number of elements in the lists */
			
 
				+	struct starpu_history_list_t *ptr = NULL;
			
 
				+	unsigned nentries = 0;
			
 
				+
			
 
				+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
 
				+	{
			
 
				+		/* Dump the list of all entries in the history */
			
 
				+		ptr = per_arch_model->list;
			
 
				+		while(ptr) {
			
 
				+			nentries++;
			
 
				+			ptr = ptr->next;
			
 
				+		}
			
 
				+	}
			
 
				+	return nentries;
			
 
				+}
			
 
				 
			
 
				+static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
			
 
				+{
			
 
				+	unsigned number_of_archs[4] = { 0, 0, 0, 0};
			
 
				 	unsigned arch;
			
 
				 	unsigned nimpl;
			
 
				+	unsigned idx = 0;
			
 
				+
			
 
				+	/* Finding the number of archs to write for each kind of device */
			
 
				 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
			
 
				 	{
			
 
				+		switch (arch)
			
 
				+		{
			
 
				+			case STARPU_CUDA_DEFAULT:
			
 
				+			case STARPU_OPENCL_DEFAULT:
			
 
				+			case STARPU_GORDON_DEFAULT:
			
 
				+				idx++;
			
 
				+				break;
			
 
				+			default:
			
 
				+				break;
			
 
				+		}
			
 
				+
			
 
				+		unsigned nentries = 0;
			
 
				+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				+		{
			
 
				+			nentries = get_n_entries(model, arch, nimpl) != 0;
			
 
				+			if (nentries > 0)
			
 
				+			{
			
 
				+				number_of_archs[idx]++;
			
 
				+				break;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* Writing stuff */
			
 
				+	char *name;
			
 
				+	unsigned substract_to_arch = 0;
			
 
				+	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
			
 
				+	{
			
 
				+		switch (arch)
			
 
				+		{
			
 
				+			case STARPU_CPU_DEFAULT:
			
 
				+				name = "CPU";
			
 
				+				fprintf(f, "##################\n");
			
 
				+				fprintf(f, "# %ss\n", name);
			
 
				+				fprintf(f, "# number of %s architectures\n", name);
			
 
				+				fprintf(f, "%u\n", number_of_archs[0]);
			
 
				+				break;
			
 
				+			case STARPU_CUDA_DEFAULT:
			
 
				+				name = "CUDA";
			
 
				+				substract_to_arch = STARPU_MAXCPUS;
			
 
				+				fprintf(f, "##################\n");
			
 
				+				fprintf(f, "# %ss\n", name);
			
 
				+				fprintf(f, "# number of %s architectures\n", name);
			
 
				+				fprintf(f, "%u\n", number_of_archs[1]);
			
 
				+				break;
			
 
				+			case STARPU_OPENCL_DEFAULT:
			
 
				+				name = "OPENCL";
			
 
				+				substract_to_arch += STARPU_MAXCUDADEVS;
			
 
				+				fprintf(f, "##################\n");
			
 
				+				fprintf(f, "# %ss\n", name);
			
 
				+				fprintf(f, "# number of %s architectures\n", name);
			
 
				+				fprintf(f, "%u\n", number_of_archs[2]);
			
 
				+				break;
			
 
				+			case STARPU_GORDON_DEFAULT:
			
 
				+				name = "GORDON";
			
 
				+				substract_to_arch += STARPU_MAXOPENCLDEVS;
			
 
				+				fprintf(f, "##################\n");
			
 
				+				fprintf(f, "# %ss\n", name);
			
 
				+				fprintf(f, "# number of %s architectures\n", name);
			
 
				+				fprintf(f, "%u\n", number_of_archs[3]);
			
 
				+				break;
			
 
				+			default:
			
 
				+				break;
			
 
				+		}
			
 
				+
			
 
				 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
			
 
				 		{
			
 
				-			char archname[32];
			
 
				-			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
			
 
				-			fprintf(f, "# Model for %s\n", archname);
			
 
				+			if (get_n_entries(model, arch, nimpl) == 0)
			
 
				+				break;
			
 
				+
			
 
				+		}
			
 
				+		unsigned max_impl = nimpl;
			
 
				+
			
 
				+		if (max_impl == 0)
			
 
				+			continue;
			
 
				+
			
 
				+		fprintf(f, "###########\n");
			
 
				+		fprintf(f, "# %s_%u\n", name, arch - substract_to_arch);
			
 
				+		fprintf(f, "# number of implementations\n");
			
 
				+		fprintf(f, "%u\n", max_impl);
			
 
				+		for (nimpl = 0; nimpl < max_impl; nimpl++)
			
 
				+		{
			
 
				 			dump_per_arch_model_file(f, model, arch, nimpl);
			
 
				-			fprintf(f, "\n##################\n");
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -300,7 +505,7 @@ static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *a
 
				 	char hostname[32];
			
 
				 	char *forced_hostname = getenv("STARPU_HOSTNAME");
			
 
				 	if (forced_hostname && forced_hostname[0])
			
 
				-		snprintf(hostname, sizeof(hostname), forced_hostname);
			
 
				+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
			
 
				 	else
			
 
				 		gethostname(hostname, sizeof(hostname));
			
 
				 	strncat(path, ".", maxlen);
			
@@ -351,7 +556,7 @@ static void get_model_path(struct starpu_perfmodel_t *model, char *path, size_t
 
				 	char hostname[32];
			
 
				 	char *forced_hostname = getenv("STARPU_HOSTNAME");
			
 
				 	if (forced_hostname && forced_hostname[0])
			
 
				-		snprintf(hostname, sizeof(hostname), forced_hostname);
			
 
				+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
			
 
				 	else
			
 
				 		gethostname(hostname, sizeof(hostname));
			
 
				 	strncat(path, ".", maxlen);
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -87,7 +87,7 @@ static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct
 
				 			cl->opencl_funcs[nimpl] == NULL);
			
 
				 	case STARPU_GORDON_WORKER:
			
 
				 		return !(cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS &&
			
 
				-			cl->gordon_funcs[nimpl] == NULL);
			
 
				+			cl->gordon_funcs[nimpl] == 0);
			
 
				 	default:
			
 
				 		return 0;
			
 
				 	}
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -357,6 +357,11 @@ void _starpu_opencl_init(void)
 
				                 // Get location of OpenCl kernel source files
			
 
				                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
			
 
				 
			
 
				+		if (nb_devices > STARPU_MAXOPENCLDEVS) {
			
 
				+			_STARPU_DISP("# Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
			
 
				+			nb_devices = STARPU_MAXOPENCLDEVS;
			
 
				+		}
			
 
				+
			
 
				                 // initialise internal structures
			
 
				                 for(i=0 ; i<nb_devices ; i++) {
			
 
				                         contexts[i] = NULL;
			
@@ -538,7 +543,6 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
				 
			
 
				 	struct timespec codelet_start, codelet_end;
			
 
				 
			
 
				-	int workerid = args->workerid;
			
 
				 	STARPU_ASSERT(task);
			
 
				 	struct starpu_codelet_t *cl = task->cl;
			
 
				 	STARPU_ASSERT(cl);
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -478,6 +478,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
				 					) {
			
 
				 				ntasks_best_end = ntasks_end;
			
 
				 				ntasks_best = worker;
			
 
				+
			
 
				 			}
			
 
				 
			
 
				 			if (local_task_length[worker_in_ctx] == -1.0)
			
@@ -492,7 +493,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
				 				unknown = 1;
			
 
				 
			
 
				 			if (unknown)
			
 
				-				continue;
			
 
				+					continue;
			
 
				 
			
 
				 			exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
			
 
				 
			
@@ -501,6 +502,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 
				 				/* a better solution was found */
			
 
				 				best_exp_end = exp_end[worker_in_ctx];
			
 
				 				best_impl = nimpl;
			
 
				+
			
 
				 			}
			
 
				 
			
 
				 			local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
			
--- a/src/sched_policies/detect_combined_workers.c
+++ b/src/sched_policies/detect_combined_workers.c
@@ -21,150 +21,454 @@
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 #include <hwloc.h>
			
 
				-#endif
			
 
				 
			
 
				-#ifdef STARPU_HAVE_HWLOC
			
 
				-/* This function returns 1 the subtree induced by obj only contains CPU
			
 
				- * workers, otherwise 0 is returned. This function registers all valid worker
			
 
				- * combination below obj. The id of the CPU workers are put in the worker_array
			
 
				- * and their count is put in the worker_cnt pointer. */
			
 
				-static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array, int *worker_cnt)
			
 
				+/* tree_t
			
 
				+ * ======
			
 
				+ * Purpose
			
 
				+ * =======
			
 
				+ * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
			
 
				+ * object and storing every workers it contained in every sub-trees by recursion.
			
 
				+ *
			
 
				+ * Fields
			
 
				+ * ======
			
 
				+ * obj			A hwloc object which can be a root or a leaf, it may be a numa node, a cache memory or a CPU, etc...
			
 
				+ *
			
 
				+ * nb_workers		Number of CPU workers which can be found by recursion in all the sub-trees beneath this one
			
 
				+ 			or in this very object.
			
 
				+ *
			
 
				+ * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
			
 
				+ */
			
 
				+
			
 
				+typedef struct tree_s{
			
 
				+    hwloc_obj_t obj;
			
 
				+    unsigned nb_workers;
			
 
				+    int *workers;
			
 
				+} tree_t;
			
 
				+
			
 
				+
			
 
				+/* gather_trees
			
 
				+ * ============
			
 
				+ * Purpose
			
 
				+ * =======
			
 
				+ * Gather all the workers of every source tree in one target tree.
			
 
				+ * We assume the target array of workers is big enough to contain all the workers.
			
 
				+ *
			
 
				+ * Arguments
			
 
				+ * =========
			
 
				+ * target_tree		(input, output)
			
 
				+ *			Pointer to the tree which will contain all the workers of every source.
			
 
				+ *
			
 
				+ * source_trees		(input)
			
 
				+ *			Array of trees we want to combine in a unique tree.
			
 
				+ *
			
 
				+ * nb_source_trees	(input)
			
 
				+ *			Number of trees we want to combine (size of the array).
			
 
				+ */
			
 
				+
			
 
				+static void gather_trees(tree_t *target_tree, tree_t *source_trees, unsigned nb_source_trees)
			
 
				+{
			
 
				+    unsigned tree_id, worker_id, index = 0;
			
 
				+    for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
			
 
				+	for(worker_id = 0; worker_id < source_trees[tree_id].nb_workers; ++worker_id)
			
 
				+	    target_tree->workers[index++] = source_trees[tree_id].workers[worker_id];
			
 
				+}
			
 
				+
			
 
				+/* assign_multiple_trees
			
 
				+ * ========================
			
 
				+ * Purpose
			
 
				+ * =======
			
 
				+ * Assign every tree which is large enough (greater than min_size) and merge small ones.
			
 
				+ * If there is no tree large enough to be assigned any more, we return.
			
 
				+ *
			
 
				+ * Return value
			
 
				+ * ============
			
 
				+ * The number of workers assigned during the function.
			
 
				+ *
			
 
				+ * Arguments
			
 
				+ * =========
			
 
				+ * trees		(input, output)
			
 
				+ *			In entry, array of trees to assign. In the end at most one tree still contains workers.
			
 
				+ *
			
 
				+ * nb_trees		(input)
			
 
				+ *			The number of trees (size of the array).
			
 
				+ *
			
 
				+ * min_size		(input)
			
 
				+ *			Minimum size of a combined worker.
			
 
				+ *
			
 
				+ * max_size		(input)
			
 
				+ *			Maximum size of a combined worker.
			
 
				+ */
			
 
				+
			
 
				+static unsigned assign_multiple_trees(tree_t *trees, unsigned nb_trees, int min_size, int max_size)
			
 
				 {
			
 
				-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				+    unsigned short complete = 0;
			
 
				+    unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
			
 
				+
			
 
				+    for(tree_id = 0; tree_id < nb_trees; ++tree_id)
			
 
				+	nb_workers_total += trees[tree_id].nb_workers;;
			
 
				 
			
 
				-	/* Is this a leaf ? (eg. a PU for hwloc) */
			
 
				-	int is_leaf = !hwloc_compare_types(config->cpu_depth, obj->depth);
			
 
				+    while(!complete)
			
 
				+    {
			
 
				+	complete = 1;
			
 
				 
			
 
				-	if (is_leaf)
			
 
				+	/* First we manage to assign every subtree large enough to be assigned alone */
			
 
				+	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
			
 
				 	{
			
 
				-		struct starpu_worker_s *worker = obj->userdata;
			
 
				+	    if(trees[tree_id].nb_workers== 0) // An already assigned subtree
			
 
				+		continue;
			
 
				 
			
 
				-		/* If this is a CPU worker, append its id at the end of the
			
 
				-		 * list */
			
 
				-		if (worker && worker->arch == STARPU_CPU_WORKER)
			
 
				+	    nb_workers_tree = trees[tree_id].nb_workers;
			
 
				+
			
 
				+	    /* We shouldn't assign a small tree if we could assign the whole trees instead */
			
 
				+	    if(nb_workers_tree >= min_size && nb_workers_total > max_size)
			
 
				+	    {
			
 
				+		int ret = starpu_combined_worker_assign_workerid(nb_workers_tree, trees[tree_id].workers);
			
 
				+		STARPU_ASSERT(ret >= 0);
			
 
				+		nb_workers_assigned += nb_workers_tree;
			
 
				+		nb_workers_total -= nb_workers_tree;
			
 
				+		trees[tree_id].nb_workers = 0;
			
 
				+	    }
			
 
				+	}
			
 
				+
			
 
				+	/* Then we merge too small subtrees into not too large ones
			
 
				+	 * if we manage to merge some subtrees we turn the flag
			
 
				+	 * complete to 0 thus we know he have to start again to assign
			
 
				+	 * just merged subtrees */
			
 
				+	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
			
 
				+	{
			
 
				+	    if(trees[tree_id].nb_workers == 0) // An already assigned subtree
			
 
				+		continue;
			
 
				+
			
 
				+	    nb_workers_tree = trees[tree_id].nb_workers;
			
 
				+
			
 
				+	    /* We go through the array to find another subtree we can merge with this one */
			
 
				+	    for(tree_id2 = 0; tree_id2 < nb_trees; ++tree_id2)
			
 
				+	    {
			
 
				+		if(trees[tree_id2].nb_workers == 0 || tree_id == tree_id2) // An already assigned subtree or the same
			
 
				+		    continue;
			
 
				+
			
 
				+		nb_workers_tree2 = trees[tree_id2].nb_workers;
			
 
				+
			
 
				+		/*  We can merge the two subtrees, let's do it */
			
 
				+		if(nb_workers_tree + nb_workers_tree2 <= max_size)
			
 
				 		{
			
 
				-			worker_array[*worker_cnt] = worker->workerid;
			
 
				-			*worker_cnt = *worker_cnt + 1;
			
 
				+		    for(worker_id = 0; worker_id < nb_workers_tree2; ++worker_id)
			
 
				+			trees[tree_id].workers[nb_workers_tree + worker_id] = trees[tree_id2].workers[worker_id];
			
 
				+
			
 
				+		    trees[tree_id].nb_workers += nb_workers_tree2;
			
 
				+		    trees[tree_id2].nb_workers = 0;
			
 
				+
			
 
				+		    /* We just merged two subtrees, we need to restart again and try to assign it */
			
 
				+		    complete = 0;
			
 
				+		    break;
			
 
				 		}
			
 
				+	    }
			
 
				 
			
 
				-		/* We cannot create a combined worker only if there is a CPU
			
 
				-		 * worker. */
			
 
				-		return (!worker || worker->arch == STARPU_CPU_WORKER);
			
 
				+	    if(!complete)
			
 
				+		break;
			
 
				 	}
			
 
				+    }
			
 
				 
			
 
				-	/* If there is only one child, we go to the next level directly */
			
 
				-	if (obj->arity == 1)
			
 
				-		return find_combinations_with_hwloc_rec(obj->children[0], worker_array, worker_cnt);
			
 
				+    return nb_workers_assigned;
			
 
				+}
			
 
				 
			
 
				-	/* We recursively go from the root to the leaves of the tree to find
			
 
				-	 * subtrees that only have CPUs as leaves. */
			
 
				-	unsigned cpu_children_cnt = 0;
			
 
				+/* find_and_assign_combinations_with_hwloc_recursive
			
 
				+ * =================================================
			
 
				+ * Purpose
			
 
				+ * =======
			
 
				+ * Go through the tree given as parameter and try to assign them. Workers it didn't succeed to
			
 
				+ * assign are given back to the calling function to be assigned using data from other subtrees if so.
			
 
				+ *
			
 
				+ * Return value
			
 
				+ * ============
			
 
				+ * The number of workers left to be assigned.
			
 
				+ *
			
 
				+ * Arguments
			
 
				+ * =========
			
 
				+ * tree			(input, output)
			
 
				+ *			Tree structure containing the root to process in entry.
			
 
				+ *			When the function returns it also contains the number of workers left
			
 
				+ *			to be assigned and these very workers in the array previously allocated.
			
 
				+ *
			
 
				+ * min_size		(input)
			
 
				+ *			Minimum size of a combined worker.
			
 
				+ *
			
 
				+ * max_size		(input)
			
 
				+ *			Maximum size of a combined worker.
			
 
				+ */
			
 
				+
			
 
				+static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree, int min_size, int max_size)
			
 
				+{
			
 
				+    unsigned subtree_id, nb_workers = 0;
			
 
				 
			
 
				-	int worker_array_rec[STARPU_NMAXWORKERS];
			
 
				-	int worker_cnt_rec = 0;
			
 
				-	memset(worker_array_rec, 0, sizeof(int)*STARPU_NMAXWORKERS);
			
 
				+    hwloc_obj_t obj = tree->obj;
			
 
				+    int *workers = tree->workers;
			
 
				 
			
 
				-	unsigned i;
			
 
				-	for (i = 0; i < obj->arity; i++)
			
 
				+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				+
			
 
				+    /* Is this a leaf ? (eg. a PU for hwloc) */
			
 
				+    if (!hwloc_compare_types(config->cpu_depth, obj->depth))
			
 
				+    {
			
 
				+	struct starpu_worker_s *worker = obj->userdata;
			
 
				+
			
 
				+	/* If this is a CPU worker add it at the beginning
			
 
				+	 * of the array , write 1 in the field nb_workers and
			
 
				+	 * return the number of CPU workers found : 1 in this case. */
			
 
				+	if (worker && worker->arch == STARPU_CPU_WORKER)
			
 
				 	{
			
 
				-		int valid_subtree = find_combinations_with_hwloc_rec(obj->children[i],
			
 
				-						worker_array_rec, &worker_cnt_rec);
			
 
				-		if (valid_subtree)
			
 
				-			cpu_children_cnt++;
			
 
				+	    workers[0] = worker->workerid;
			
 
				+	    tree->nb_workers = 1;
			
 
				+	    return 1;
			
 
				 	}
			
 
				 
			
 
				-	int child;
			
 
				+	tree->nb_workers = 0;
			
 
				+	return 0;
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+    /* If there is only one child, we go to the next level right away */
			
 
				+    if (obj->arity == 1)
			
 
				+    {
			
 
				+	tree_t subtree = *tree;
			
 
				+	subtree.obj = obj->children[0];
			
 
				+	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
			
 
				+	tree->nb_workers = nb_workers;
			
 
				+	return nb_workers;
			
 
				+    }
			
 
				+
			
 
				+    /* We recursively go to the leaves of the tree to find subtrees which have the biggest number of
			
 
				+     * CPU leaves that fits between min and max. */
			
 
				+
			
 
				+    /* We allocate an array of tree structures which will contain the current node's subtrees data */
			
 
				+    tree_t *subtrees = (tree_t *) malloc(obj->arity * sizeof(tree_t));
			
 
				+
			
 
				+    /* We allocate the array containing the workers of each subtree and initialize the fields left */
			
 
				+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
			
 
				+    {
			
 
				+	tree_t *subtree = subtrees + subtree_id;
			
 
				+
			
 
				+	subtree->obj = obj->children[subtree_id];
			
 
				+	subtree->nb_workers = 0;
			
 
				+	subtree->workers = (int *) malloc(config->topology.nhwcpus * sizeof(int));
			
 
				+    }
			
 
				+
			
 
				+    /* We recursively go through every subtree and get all the workers which are not assigned yet */
			
 
				+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
			
 
				+	nb_workers += find_and_assign_combinations_with_hwloc_recursive(subtrees + subtree_id, min_size, max_size);
			
 
				+
			
 
				+    if(nb_workers > max_size)
			
 
				+    {
			
 
				+	/* We withdraw the number of workers just assigned from the total number of workers */
			
 
				+	nb_workers -= assign_multiple_trees(subtrees, obj->arity, min_size, max_size);
			
 
				 
			
 
				-	if (cpu_children_cnt == obj->arity)
			
 
				-	for (child = 0; child < worker_cnt_rec; child++)
			
 
				+	/* Some workers are not assigned yet : we gather them in the array
			
 
				+	 * which is returned to the father which will handle them later */
			
 
				+	if(nb_workers)
			
 
				+	    gather_trees(tree, subtrees, obj->arity);
			
 
				+    }
			
 
				+    else if(nb_workers < max_size)
			
 
				+    {
			
 
				+	gather_trees(tree, subtrees, obj->arity);
			
 
				+    }
			
 
				+    else // nb_workers == max_size
			
 
				+    {
			
 
				+	gather_trees(tree, subtrees, obj->arity);
			
 
				+
			
 
				+	int ret = starpu_combined_worker_assign_workerid(nb_workers, workers);
			
 
				+	STARPU_ASSERT(ret >= 0);
			
 
				+	nb_workers = 0;
			
 
				+    }
			
 
				+
			
 
				+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
			
 
				+	free(subtrees[subtree_id].workers);
			
 
				+    free(subtrees);
			
 
				+
			
 
				+    tree->nb_workers = nb_workers;
			
 
				+    return nb_workers;
			
 
				+}
			
 
				+
			
 
				+/* get_min_max_sizes
			
 
				+ * =================================================
			
 
				+ * Purpose
			
 
				+ * =======
			
 
				+ * First, try to get the value from the STARPU_MIN_WORKERSIZE and STARPU_MAX_WORKERSIZE
			
 
				+ * environment variables.
			
 
				+ * If both of them were not set, then we try do get some efficient values following the rule beneath :
			
 
				+ *
			
 
				+ * 				-->   exact 	-->  MIN_SIZE = S-1 <--> MAX_SIZE = S+1
			
 
				+ * S = square_root(nb_cpus)
			
 
				+ *				-->   decimal 	-->  MIN_SIZE = truncation(S) <--> MAX_SIZE = rounding_up(S)
			
 
				+ *
			
 
				+ * If only one of both was not set then we set it with a value relative to the other, for example :
			
 
				+ *
			
 
				+ *		 	MIN_SIZE = MAX_SIZE - 1 or MAX_SIZE = MIN_SIZE + 1
			
 
				+ *
			
 
				+ * Arguments
			
 
				+ * =========
			
 
				+ * min_size		(output)
			
 
				+ *			Pointer to the minimum size of a combined worker, whether set with
			
 
				+ *			value given by the user or processed from the number of cpus.
			
 
				+ *
			
 
				+ * max_size		(output)
			
 
				+ *			Pointer to the maximum size of a combined worker, whether set with
			
 
				+ *			value given by the user or processed from the number of cpus.
			
 
				+ *
			
 
				+ * topology		(input)
			
 
				+ *			Topology of the machine : used to know the number of cpus.
			
 
				+ */
			
 
				+
			
 
				+static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machine_topology_s *topology)
			
 
				+{
			
 
				+    int _min_size, _max_size;
			
 
				+
			
 
				+    _min_size = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
			
 
				+    _max_size = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
			
 
				+
			
 
				+    /* If the user didn't set both the environment variables,
			
 
				+     * we need to find a minimum and a maximum size ourselves */
			
 
				+    if(_min_size <= -1 || _max_size <= -1)
			
 
				+    {
			
 
				+
			
 
				+	int nb_cpus = topology->nhwcpus;
			
 
				+	int sqrt_nb_cpus = sqrt(nb_cpus);
			
 
				+	short exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
			
 
				+
			
 
				+	    if(_min_size == -1)
			
 
				+	    {
			
 
				+		if(_max_size > -1)
			
 
				+		    _min_size = _max_size - 1;
			
 
				+		else
			
 
				+		    _min_size = exact ? sqrt_nb_cpus - 1 : sqrt_nb_cpus;
			
 
				+	    }
			
 
				+
			
 
				+	if(_max_size == -1)
			
 
				 	{
			
 
				-		worker_array[*worker_cnt] = worker_array_rec[child];
			
 
				-		*worker_cnt = *worker_cnt + 1;
			
 
				+	    if(_min_size > -1)
			
 
				+		_max_size = _min_size + 1;
			
 
				+	    else
			
 
				+		_max_size = sqrt_nb_cpus + 1;
			
 
				 	}
			
 
				-	
			
 
				-	/* If there is at least 2 children that are valid, we combined them. */
			
 
				-	int maxsize = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
			
 
				-	int minsize = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
			
 
				+    }
			
 
				 
			
 
				-	if (cpu_children_cnt > 1 && worker_cnt_rec > 0 && worker_cnt_rec <= maxsize && worker_cnt_rec >= minsize)
			
 
				-		starpu_combined_worker_assign_workerid(worker_cnt_rec, worker_array_rec);
			
 
				+    *min_size = _min_size;
			
 
				+    *max_size = _max_size;
			
 
				 
			
 
				-	return (cpu_children_cnt == obj->arity);
			
 
				+    return;
			
 
				 }
			
 
				 
			
 
				-static void find_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
			
 
				+/* find_and_assign_combinations_with_hwloc
			
 
				+ * =======================================
			
 
				+ * Purpose
			
 
				+ * =======
			
 
				+ * Launches find_and_assign_combinations_with_hwloc_recursive function on the root
			
 
				+ * of the hwloc tree to gather and assign combined cpu workers in an efficient manner.
			
 
				+ * When find_and_assign_combinations_with_hwloc_recursive returns, if there are still
			
 
				+ * some workers, we assign them no matter the number for there is no way to respect
			
 
				+ * the wanted sizes anymore.
			
 
				+ *
			
 
				+ * Arguments
			
 
				+ * =========
			
 
				+ * topology		(input)
			
 
				+ *			Topology of the machine : used to know the number of cpus and
			
 
				+ *			to get the hwloc tree.
			
 
				+ */
			
 
				+
			
 
				+static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
			
 
				 {
			
 
				-	/* We don't care about the result */
			
 
				-	int worker_array[STARPU_NMAXWORKERS];
			
 
				-	int worker_cnt = 0;
			
 
				-
			
 
				-	/* We recursively go from the root to the leaves of the tree to find
			
 
				-	 * subtrees that only have CPUs as leaves. */
			
 
				-	hwloc_obj_t root;
			
 
				-	root = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
			
 
				-	find_combinations_with_hwloc_rec(root, worker_array, &worker_cnt);
			
 
				+    unsigned nb_workers;
			
 
				+    int min_size, max_size;
			
 
				+
			
 
				+    get_min_max_sizes(&min_size, &max_size, topology);
			
 
				+
			
 
				+    STARPU_ASSERT(min_size <= max_size);
			
 
				+
			
 
				+    tree_t tree;
			
 
				+
			
 
				+    /* Of course we start from the root */
			
 
				+    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
			
 
				+    tree.nb_workers = 0;
			
 
				+    tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
			
 
				+
			
 
				+    /* We recursively go from the root to the leaves of the tree to find
			
 
				+     * subtrees that only have CPUs as leaves. */
			
 
				+    nb_workers = find_and_assign_combinations_with_hwloc_recursive(&tree, min_size, max_size);
			
 
				+
			
 
				+    /* There are still some workers left, since the only possibility is that
			
 
				+     * the number of workers left is less than the minimum worker size we assign them all */
			
 
				+    if(nb_workers > 0)
			
 
				+    {
			
 
				+	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
			
 
				+	 * while there are enough workers to assign regarding the min_size value */
			
 
				+	STARPU_ASSERT(nb_workers < max_size);
			
 
				+
			
 
				+	int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
			
 
				+	STARPU_ASSERT(ret >= 0);
			
 
				+    }
			
 
				+
			
 
				+    free(tree.workers);
			
 
				 }
			
 
				 
			
 
				-#else
			
 
				+#else /* STARPU_HAVE_HWLOC */
			
 
				 
			
 
				-static void find_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
			
 
				+static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
			
 
				 {
			
 
				-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				 
			
 
				-	/* We put the id of all CPU workers in this array */
			
 
				-	int cpu_workers[STARPU_NMAXWORKERS];
			
 
				-	unsigned ncpus = 0;
			
 
				+    /* We put the id of all CPU workers in this array */
			
 
				+    int cpu_workers[STARPU_NMAXWORKERS];
			
 
				+    unsigned ncpus = 0;
			
 
				 
			
 
				-	unsigned i;
			
 
				-	for (i = 0; i < topology->nworkers; i++)
			
 
				-	{
			
 
				-		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
			
 
				-			cpu_workers[ncpus++] = i;
			
 
				-	}
			
 
				-	
			
 
				-	unsigned size;
			
 
				-	for (size = 2; size <= ncpus; size *= 2)
			
 
				+    unsigned i;
			
 
				+    for (i = 0; i < topology->nworkers; i++)
			
 
				+    {
			
 
				+	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
			
 
				+	    cpu_workers[ncpus++] = i;
			
 
				+    }
			
 
				+
			
 
				+    unsigned size;
			
 
				+    for (size = 2; size <= ncpus; size *= 2)
			
 
				+    {
			
 
				+	unsigned first_cpu;
			
 
				+	for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
			
 
				 	{
			
 
				-		unsigned first_cpu;
			
 
				-		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
			
 
				-		{
			
 
				-			if (first_cpu + size <= ncpus)
			
 
				-			{
			
 
				-				int workerids[size];
			
 
				-
			
 
				-				for (i = 0; i < size; i++)
			
 
				-					workerids[i] = cpu_workers[first_cpu + i];
			
 
				-
			
 
				-				/* We register this combination */
			
 
				-				int ret;
			
 
				-				ret = starpu_combined_worker_assign_workerid(size, workerids); 
			
 
				-				STARPU_ASSERT(ret >= 0);
			
 
				-			}
			
 
				-		}
			
 
				+	    if (first_cpu + size <= ncpus)
			
 
				+	    {
			
 
				+		int workerids[size];
			
 
				+
			
 
				+		for (i = 0; i < size; i++)
			
 
				+		    workerids[i] = cpu_workers[first_cpu + i];
			
 
				+
			
 
				+		/* We register this combination */
			
 
				+		int ret;
			
 
				+		ret = starpu_combined_worker_assign_workerid(size, workerids); 
			
 
				+		STARPU_ASSERT(ret >= 0);
			
 
				+	    }
			
 
				 	}
			
 
				+    }
			
 
				 }
			
 
				-#endif
			
 
				+
			
 
				+#endif /* STARPU_HAVE_HWLOC */
			
 
				 
			
 
				 static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
			
 
				 {
			
 
				-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				 
			
 
				-	int cpu_workers[STARPU_NMAXWORKERS];
			
 
				-	unsigned ncpus = 0;
			
 
				+    int cpu_workers[STARPU_NMAXWORKERS];
			
 
				+    unsigned ncpus = 0;
			
 
				 
			
 
				-	unsigned i;
			
 
				-	for (i = 0; i < topology->nworkers; i++)
			
 
				-	{
			
 
				-		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
			
 
				-			cpu_workers[ncpus++] = i;
			
 
				-	}
			
 
				+    unsigned i;
			
 
				+    for (i = 0; i < topology->nworkers; i++)
			
 
				+    {
			
 
				+	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
			
 
				+	    cpu_workers[ncpus++] = i;
			
 
				+    }
			
 
				 
			
 
				-	if (ncpus > 0)
			
 
				-	{
			
 
				-		int ret;
			
 
				-		ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
			
 
				-		STARPU_ASSERT(ret >= 0);
			
 
				-	}
			
 
				+    if (ncpus > 0)
			
 
				+    {
			
 
				+	int ret;
			
 
				+	ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
			
 
				+	STARPU_ASSERT(ret >= 0);
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
			
@@ -175,9 +479,9 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 
				 	combine_all_cpu_workers(topology);
			
 
				     else {
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				-	find_combinations_with_hwloc(topology);
			
 
				+	find_and_assign_combinations_with_hwloc(topology);
			
 
				 #else
			
 
				-	find_combinations_without_hwloc(topology);
			
 
				+	find_and_assign_combinations_without_hwloc(topology);
			
 
				 #endif
			
 
				     }
			
 
				 }
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -121,6 +121,7 @@ noinst_PROGRAMS =				\
 
				 	datawizard/acquire_cb_insert		\
			
 
				 	datawizard/acquire_release		\
			
 
				 	datawizard/acquire_release2		\
			
 
				+	datawizard/copy				\
			
 
				 	datawizard/data_implicit_deps		\
			
 
				 	datawizard/data_lookup			\
			
 
				 	datawizard/scratch			\
			
--- a/tests/datawizard/copy.c
+++ b/tests/datawizard/copy.c
@@ -0,0 +1,102 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				+static unsigned nloops = 1000;
			
 
				+
			
 
				+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static starpu_codelet cpu_codelet =
			
 
				+{
			
 
				+        .where = STARPU_CPU,
			
 
				+        .cpu_func = dummy_func,
			
 
				+        .model = NULL,
			
 
				+        .nbuffers = 1
			
 
				+};
			
 
				+
			
 
				+static starpu_codelet gpu_codelet =
			
 
				+{
			
 
				+        .where = STARPU_CUDA|STARPU_OPENCL,
			
 
				+        .cuda_func = dummy_func,
			
 
				+        .opencl_func = dummy_func,
			
 
				+        .model = NULL,
			
 
				+        .nbuffers = 1
			
 
				+};
			
 
				+
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+        float foo;
			
 
				+	starpu_data_handle float_array_handle;
			
 
				+        int i;
			
 
				+
			
 
				+        starpu_init(NULL);
			
 
				+
			
 
				+	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "This application requires a CUDA or OpenCL Worker\n");
			
 
				+		starpu_shutdown();
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				+        foo = 0.0f;
			
 
				+	starpu_variable_data_register(&float_array_handle, 0, (uintptr_t)&foo, sizeof(foo));
			
 
				+
			
 
				+        for (i = 0; i < nloops; i++)
			
 
				+        {
			
 
				+		struct starpu_task *task_cpu, *task_gpu;
			
 
				+                int ret;
			
 
				+
			
 
				+		task_cpu = starpu_task_create();
			
 
				+		task_gpu = starpu_task_create();
			
 
				+
			
 
				+		task_cpu->cl = &cpu_codelet;
			
 
				+		task_cpu->callback_func = NULL;
			
 
				+		task_cpu->buffers[0].handle = float_array_handle;
			
 
				+		task_cpu->buffers[0].mode = STARPU_RW;
			
 
				+
			
 
				+		task_gpu->cl = &gpu_codelet;
			
 
				+		task_gpu->callback_func = NULL;
			
 
				+		task_gpu->buffers[0].handle = float_array_handle;
			
 
				+		task_gpu->buffers[0].mode = STARPU_RW;
			
 
				+
			
 
				+		ret = starpu_task_submit(task_cpu);
			
 
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+		{
			
 
				+			FPRINTF(stderr, "No worker may execute this task\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+
			
 
				+		ret = starpu_task_submit(task_gpu);
			
 
				+		if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+		{
			
 
				+			FPRINTF(stderr, "No worker may execute this task\n");
			
 
				+			exit(0);
			
 
				+		}
			
 
				+        }
			
 
				+
			
 
				+	starpu_task_wait_for_all();
			
 
				+	starpu_data_unregister(float_array_handle);
			
 
				+        starpu_shutdown();
			
 
				+
			
 
				+        return 0;
			
 
				+}
			
--- a/tests/datawizard/dining_philosophers.c
+++ b/tests/datawizard/dining_philosophers.c
@@ -23,6 +23,8 @@
 
				 starpu_data_handle fork_handles[N];
			
 
				 unsigned forks[N];
			
 
				 
			
 
				+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
			
 
				+
			
 
				 static void eat_kernel(void *descr[], void *arg)
			
 
				 {
			
 
				 }
			
@@ -78,6 +80,12 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_task_wait_for_all();
			
 
				 
			
 
				+	FPRINTF(stderr, "waiting done\n");
			
 
				+	for (f = 0; f < N; f++)
			
 
				+	{
			
 
				+		starpu_data_unregister(fork_handles[f]);
			
 
				+	}
			
 
				+
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return 0;