Преглед на файлове

merge with trunk revision 4143

Andra Hugo преди 13 години
родител
ревизия
482baecdf7
променени са 40 файла, в които са добавени 1795 реда и са изтрити 221 реда
  1. 3 1
      Makefile.am
  2. 12 18
      configure.ac
  3. 42 41
      doc/starpu.texi
  4. 67 0
      examples/Makefile.am
  5. 1 1
      examples/basic_examples/block.c
  6. 214 0
      examples/lu/blas_complex.c
  7. 156 0
      examples/lu/blas_complex.h
  8. 19 0
      examples/lu/clu.c
  9. 19 0
      examples/lu/clu_implicit.c
  10. 19 0
      examples/lu/clu_implicit_pivot.c
  11. 19 0
      examples/lu/clu_kernels.c
  12. 19 0
      examples/lu/clu_pivot.c
  13. 52 0
      examples/lu/complex_double.h
  14. 52 0
      examples/lu/complex_float.h
  15. 1 0
      examples/lu/double.h
  16. 1 0
      examples/lu/float.h
  17. 13 0
      examples/lu/lu_example.c
  18. 19 0
      examples/lu/lu_example_complex_double.c
  19. 19 0
      examples/lu/lu_example_complex_float.c
  20. 28 18
      examples/lu/xlu_kernels.c
  21. 19 0
      examples/lu/zlu.c
  22. 19 0
      examples/lu/zlu_implicit.c
  23. 19 0
      examples/lu/zlu_implicit_pivot.c
  24. 19 0
      examples/lu/zlu_kernels.c
  25. 19 0
      examples/lu/zlu_pivot.c
  26. 19 1
      gcc-plugin/examples/Makefile.am
  27. 127 0
      gcc-plugin/examples/stencil5.c
  28. 6 0
      gcc-plugin/src/starpu.c
  29. 9 4
      gcc-plugin/tests/run-test.in
  30. 3 0
      gcc-plugin/tests/task-errors.c
  31. 5 7
      include/starpu_scheduler.h
  32. 7 7
      src/core/perfmodel/perfmodel_bus.c
  33. 219 14
      src/core/perfmodel/perfmodel_history.c
  34. 1 1
      src/core/workers.c
  35. 5 1
      src/drivers/opencl/driver_opencl.c
  36. 3 1
      src/sched_policies/deque_modeling_policy_data_aware.c
  37. 410 106
      src/sched_policies/detect_combined_workers.c
  38. 1 0
      tests/Makefile.am
  39. 102 0
      tests/datawizard/copy.c
  40. 8 0
      tests/datawizard/dining_philosophers.c

+ 3 - 1
Makefile.am

@@ -65,13 +65,15 @@ if BUILD_STARPU_TOP
 all-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 clean-local:
-	cd starpu-top ; $(MAKE) clean
+	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
 # TODO: resources
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)
 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
 uninstall-local:
 	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
+	$(RM) starpu-top/StarPU-Top
+	$(RM) starpu-top/Makefile
 endif
 
 if STARPU_HAVE_WINDOWS

+ 12 - 18
configure.ac

@@ -594,9 +594,11 @@ if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
     	STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
         if test "$have_valid_opencl" = "no" ; then
             for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH" ; do
-    	        STARPU_CHECK_OPENCL($f, "no", "no")
-                if test "$have_valid_opencl" = "yes" ; then
-                    break
+                if test -n $f ; then
+    	            STARPU_CHECK_OPENCL($f, "no", "no")
+                    if test "$have_valid_opencl" = "yes" ; then
+                        break
+                    fi
                 fi
             done
         fi
@@ -1389,24 +1391,11 @@ AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
+  chmod +x gcc-plugin/tests/run-test
 ])
 
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
-AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
-
-if test $build_gcc_plugin == "yes" ; then
-    AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
-    AC_OUTPUT([
-	    gcc-plugin/Makefile
-	    gcc-plugin/src/Makefile
-	    gcc-plugin/tests/Makefile
-	    gcc-plugin/tests/run-test
-	    gcc-plugin/examples/Makefile
-            ])
-    AC_CONFIG_COMMANDS([executable-plugin-scripts], [
-            chmod +x gcc-plugin/tests/run-test
-            ])
-fi 
+AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h gcc-plugin/src/starpu-gcc-config.h)
 
 AC_OUTPUT([
 	Makefile
@@ -1425,6 +1414,11 @@ AC_OUTPUT([
 	doc/Makefile
 	mpi/Makefile
 	starpu-top/StarPU-Top.pro
+        gcc-plugin/Makefile
+	gcc-plugin/src/Makefile
+	gcc-plugin/tests/Makefile
+	gcc-plugin/tests/run-test
+	gcc-plugin/examples/Makefile
 ])
 
 AC_MSG_NOTICE([

+ 42 - 41
doc/starpu.texi

@@ -409,7 +409,7 @@ configuration:
 * Setting flags for compiling and linking applications::  
 * Running a basic StarPU application::  
 * Kernel threads started by StarPU::
-* Using accelerators::          
+* Enabling OpenCL::
 @end menu
 
 @node Setting flags for compiling and linking applications
@@ -462,14 +462,20 @@ installed. This step is done only once per user and per machine.
 @node Kernel threads started by StarPU
 @section Kernel threads started by StarPU
 
-TODO: StarPU starts one thread per CPU core and binds them there, uses one of
-them per GPU. The application is not supposed to do computations in its own
-threads. TODO: add a StarPU function to bind an application thread (e.g. the
-main thread) to a dedicated core (and thus disable the corresponding StarPU CPU
-worker).
+StarPU automatically binds one thread per CPU core. It does not use
+SMT/hyperthreading because kernels are usually already optimized for using a
+full core, and using hyperthreading would make kernel calibration rather random.
 
-@node Using accelerators
-@section Using accelerators
+Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU
+
+While StarPU tasks are executing, the application is not supposed to do
+computations in the threads it starts itself, tasks should be used instead.
+
+TODO: add a StarPU function to bind an application thread (e.g. the main thread)
+to a dedicated core (and thus disable the corresponding StarPU CPU worker).
+
+@node Enabling OpenCL
+@section Enabling OpenCL
 
 When both CUDA and OpenCL drivers are enabled, StarPU will launch an
 OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
@@ -477,8 +483,26 @@ This design choice was necessary as OpenCL and CUDA can not run at the
 same time on the same NVIDIA GPU, as there is currently no interoperability
 between them.
 
-Details on how to specify devices running OpenCL and the ones running
-CUDA are given in @ref{Enabling OpenCL}.
+To enable OpenCL, you need either to disable CUDA when configuring StarPU:
+
+@example
+% ./configure --disable-cuda
+@end example
+
+or when running applications:
+
+@example
+% STARPU_NCUDA=0 ./application
+@end example
+
+OpenCL will automatically be started on any device not yet used by
+CUDA. So on a machine running 4 GPUS, it is therefore possible to
+enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
+so:
+
+@example
+% STARPU_NCUDA=2 ./application
+@end example
 
 
 @c ---------------------------------------------------------------------
@@ -1110,7 +1134,7 @@ or for example, by disabling CPU devices:
 @end smallexample
 
 or by disabling CUDA devices (which may permit to enable the use of OpenCL,
-see @ref{Using accelerators}):
+see @ref{Enabling OpenCL}):
 
 @smallexample
 % STARPU_NCUDA=0 ./vector_scal
@@ -2153,7 +2177,9 @@ very natural way by the means of asynchronous interactions between the
 application and StarPU.  This is implemented in a separate libstarpumpi library
 which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
 @code{void *} buffers are replaced with @code{starpu_data_handle}s, and all
-GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.
+GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.  The user has to
+use the usual @code{mpirun} command of the MPI implementation to start StarPU on
+the different MPI nodes.
 
 @menu
 * The API::                     
@@ -3909,11 +3935,13 @@ This partitions a vector into blocks of the same size.
 
 
 @deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a vector into blocks of sizes given in @var{filter_arg_ptr}.
+This partitions a vector into blocks of sizes given in the @var{filter_arg_ptr}
+field of @var{f}, supposed to point on a @code{uint32_t*} array.
 @end deftypefun
 
 @deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a vector into two blocks, the first block size being given in @var{filter_arg}.
+This partitions a vector into two blocks, the first block size being given in
+the @var{filter_arg} field of @var{f}.
 @end deftypefun
 
 
@@ -4601,38 +4629,11 @@ This function synchronously deinitializes the CUBLAS library on every CUDA devic
 @section OpenCL extensions
 
 @menu
-* Enabling OpenCL::            Enabling OpenCL
 * Compiling OpenCL kernels::   Compiling OpenCL kernels
 * Loading OpenCL kernels::     Loading OpenCL kernels
 * OpenCL statistics::          Collecting statistics from OpenCL
 @end menu
 
-@node Enabling OpenCL
-@subsection Enabling OpenCL
-
-On GPU devices which can run both CUDA and OpenCL, CUDA will be
-enabled by default. To enable OpenCL, you need either to disable CUDA
-when configuring StarPU:
-
-@example
-% ./configure --disable-cuda
-@end example
-
-or when running applications:
-
-@example
-% STARPU_NCUDA=0 ./application
-@end example
-
-OpenCL will automatically be started on any device not yet used by
-CUDA. So on a machine running 4 GPUS, it is therefore possible to
-enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
-so:
-
-@example
-% STARPU_NCUDA=2 ./application
-@end example
-
 @node Compiling OpenCL kernels
 @subsection Compiling OpenCL kernels
 

+ 67 - 0
examples/Makefile.am

@@ -112,6 +112,9 @@ noinst_HEADERS = 				\
 	lu/xlu_kernels.h			\
 	lu/float.h				\
 	lu/double.h				\
+	lu/complex_float.h			\
+	lu/complex_double.h			\
+	lu/blas_complex.h			\
 	cholesky/cholesky.h			\
 	common/blas_model.h			\
 	common/blas.h				\
@@ -195,6 +198,14 @@ examplebin_PROGRAMS +=				\
 	cg/cg
 endif
 
+if MKL_BLAS_LIB
+examplebin_PROGRAMS +=				\
+	lu/lu_example_complex_float		\
+	lu/lu_example_complex_double		\
+	lu/lu_implicit_example_complex_float	\
+	lu/lu_implicit_example_complex_double
+endif
+
 if ATLAS_BLAS_LIB
 examplebin_PROGRAMS +=				\
 	spmv/dw_block_spmv
@@ -244,6 +255,14 @@ STARPU_EXAMPLES +=				\
 	cg/cg
 endif
 
+if MKL_BLAS_LIB
+STARPU_EXAMPLES +=				\
+	lu/lu_example_complex_float		\
+	lu/lu_example_complex_double		\
+	lu/lu_implicit_example_complex_float	\
+	lu/lu_implicit_example_complex_double
+endif
+
 if ATLAS_BLAS_LIB
 STARPU_EXAMPLES +=				\
 	spmv/dw_block_spmv
@@ -466,8 +485,56 @@ lu_lu_implicit_example_double_SOURCES =		\
 
 lu_lu_implicit_example_double_LDADD =		\
 	$(STARPU_BLAS_LDFLAGS)
+
+if MKL_BLAS_LIB
+lu_lu_example_complex_float_SOURCES =		\
+	lu/lu_example_complex_float.c		\
+	lu/clu.c				\
+	lu/clu_pivot.c				\
+	lu/clu_kernels.c			\
+	lu/blas_complex.c			\
+	common/blas.c
+
+lu_lu_example_complex_float_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
+lu_lu_implicit_example_complex_float_SOURCES =	\
+	lu/lu_example_complex_float.c		\
+	lu/clu_implicit.c			\
+	lu/clu_implicit_pivot.c			\
+	lu/clu_kernels.c			\
+	lu/blas_complex.c			\
+	common/blas.c
+
+lu_lu_implicit_example_complex_float_LDADD =	\
+	$(STARPU_BLAS_LDFLAGS)
+
+lu_lu_example_complex_double_SOURCES =		\
+	lu/lu_example_complex_double.c		\
+	lu/zlu.c				\
+	lu/zlu_pivot.c				\
+	lu/zlu_kernels.c			\
+	lu/blas_complex.c			\
+	common/blas.c
+
+lu_lu_example_complex_double_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
+lu_lu_implicit_example_complex_double_SOURCES =	\
+	lu/lu_example_complex_double.c		\
+	lu/zlu_implicit.c			\
+	lu/zlu_implicit_pivot.c			\
+	lu/zlu_kernels.c			\
+	lu/blas_complex.c			\
+	common/blas.c
+
+lu_lu_implicit_example_complex_double_LDADD =	\
+	$(STARPU_BLAS_LDFLAGS)
+
+endif
 endif
 
+
 ###########################
 # 2 Cholesky in 2 ctxs  example #
 ###########################

+ 1 - 1
examples/basic_examples/block.c

@@ -122,5 +122,5 @@ int main(int argc, char **argv)
 
         starpu_shutdown();
 
-	return 0;
+	return (ret!=1);
 }

+ 214 - 0
examples/lu/blas_complex.c

@@ -0,0 +1,214 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <complex.h>
+
+#include <starpu.h>
+#include "blas_complex.h"
+
+/*
+    This files contains BLAS wrappers for the different BLAS implementations
+  (eg. REFBLAS, STARPU_ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
+  libraries do not supply C-based ordering.
+ */
+
+#ifdef STARPU_ATLAS
+#error not implemented
+#elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
+#error not implemented
+#elif defined(STARPU_MKL)
+
+inline void CGEMM(char *transa, char *transb, int M, int N, int K, 
+			complex float alpha, complex float *A, int lda, complex float *B, int ldb, 
+			complex float beta, complex float *C, int ldc)
+{
+	cgemm_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+inline void ZGEMM(char *transa, char *transb, int M, int N, int K, 
+			complex double alpha, complex double *A, int lda, complex double *B, int ldb, 
+			complex double beta, complex double *C, int ldc)
+{
+	zgemm_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+
+inline void CGEMV(char *transa, int M, int N, complex float alpha, complex float *A, int lda,
+		complex float *X, int incX, complex float beta, complex float *Y, int incY)
+{
+	cgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline void ZGEMV(char *transa, int M, int N, complex double alpha, complex double *A, int lda,
+		complex double *X, int incX, complex double beta, complex double *Y, int incY)
+{
+	zgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline float SCASUM(int N, complex float *X, int incX)
+{
+	return scasum_(&N, X, &incX);
+}
+
+inline double DZASUM(int N, complex double *X, int incX)
+{
+	return dzasum_(&N, X, &incX);
+}
+
+void CSCAL(int N, complex float alpha, complex float *X, int incX)
+{
+	cscal_(&N, &alpha, X, &incX);
+}
+
+void ZSCAL(int N, complex double alpha, complex double *X, int incX)
+{
+	zscal_(&N, &alpha, X, &incX);
+}
+
+void CTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const complex float alpha, const complex float *A, const int lda,
+                   complex float *B, const int ldb)
+{
+	ctrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void ZTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const complex double alpha, const complex double *A, const int lda,
+                   complex double *B, const int ldb)
+{
+	ztrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void CSYR (const char *uplo, const int n, const complex float alpha,
+                  const complex float *x, const int incx, complex float *A, const int lda)
+{
+	csyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
+}
+
+void CSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const complex float alpha, const complex float *A,
+                   const int lda, const complex float beta, complex float *C,
+                   const int ldc)
+{
+	csyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
+}
+
+void CGERU(const int m, const int n, const complex float alpha,
+                  const complex float *x, const int incx, const complex float *y,
+                  const int incy, complex float *A, const int lda)
+{
+	cgeru_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void ZGERU(const int m, const int n, const complex double alpha,
+                  const complex double *x, const int incx, const complex double *y,
+                  const int incy, complex double *A, const int lda)
+{
+	zgeru_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void CTRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const complex float *A, const int lda, complex float *x, 
+                   const int incx)
+{
+	ctrsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
+}
+
+void CTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const complex float alpha, const complex float *A, const int lda,
+                 complex float *B, const int ldb)
+{
+	ctrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void ZTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const complex double alpha, const complex double *A, const int lda,
+                 complex double *B, const int ldb)
+{
+	ztrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void CTRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const complex float *A, const int lda, complex float *X,
+                 const int incX)
+{
+	ctrmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
+}
+
+void CAXPY(const int n, const complex float alpha, complex float *X, const int incX, complex float *Y, const int incY)
+{
+	caxpy_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+void ZAXPY(const int n, const complex double alpha, complex double *X, const int incX, complex double *Y, const int incY)
+{
+	zaxpy_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+int ICAMAX (const int n, complex float *X, const int incX)
+{
+    int retVal;
+    retVal = icamax_ (&n, X, &incX);
+    return retVal;
+}
+
+int IZAMAX (const int n, complex double *X, const int incX)
+{
+    int retVal;
+    retVal = izamax_ (&n, X, &incX);
+    return retVal;
+}
+
+complex float CDOTU(const int n, const complex float *x, const int incx, const complex float *y, const int incy)
+{
+	complex float retVal = 0;
+
+	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
+	retVal = (float)cdotu_(&n, x, &incx, y, &incy);
+
+	return retVal;
+}
+
+complex double ZDOTU(const int n, const complex double *x, const int incx, const complex double *y, const int incy)
+{
+	return zdotu_(&n, x, &incx, y, &incy);
+}
+
+void CSWAP(const int n, complex float *X, const int incX, complex float *Y, const int incY)
+{
+	cswap_(&n, X, &incX, Y, &incY);
+}
+
+void ZSWAP(const int n, complex double *X, const int incX, complex double *Y, const int incY)
+{
+	zswap_(&n, X, &incX, Y, &incY);
+}
+
+
+#else
+#error "no BLAS lib available..."
+#endif

+ 156 - 0
examples/lu/blas_complex.h

@@ -0,0 +1,156 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_H__
+#define __BLAS_H__
+
+#include <starpu.h>
+#if defined(STARPU_MKL)
+#define MKLcomplex8 complex float
+#define MKLcomplex16 complex double
+#endif
+
+void CGEMM(char *transa, char *transb, int M, int N, int K, complex float alpha, complex float *A, int lda, 
+		complex float *B, int ldb, complex float beta, complex float *C, int ldc);
+void ZGEMM(char *transa, char *transb, int M, int N, int K, complex double alpha, complex double *A, int lda, 
+		complex double *B, int ldb, complex double beta, complex double *C, int ldc);
+void CGEMV(char *transa, int M, int N, complex float alpha, complex float *A, int lda,
+		complex float *X, int incX, complex float beta, complex float *Y, int incY);
+void ZGEMV(char *transa, int M, int N, complex double alpha, complex double *A, int lda,
+		complex double *X, int incX, complex double beta, complex double *Y, int incY);
+float SCASUM(int N, complex float *X, int incX);
+double DZASUM(int N, complex double *X, int incX);
+void CSCAL(int N, complex float alpha, complex float *X, int incX);
+void ZSCAL(int N, complex double alpha, complex double *X, int incX);
+void CTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const complex float alpha, const complex float *A, const int lda,
+                   complex float *B, const int ldb);
+void ZTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const complex double alpha, const complex double *A, const int lda,
+                   complex double *B, const int ldb);
+void CSYR (const char *uplo, const int n, const complex float alpha,
+                  const complex float *x, const int incx, complex float *A, const int lda);
+void CSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const complex float alpha, const complex float *A,
+                   const int lda, const complex float beta, complex float *C,
+                   const int ldc);
+void CGERU (const int m, const int n, const complex float alpha,
+                  const complex float *x, const int incx, const complex float *y,
+                  const int incy, complex float *A, const int lda);
+void ZGERU(const int m, const int n, const complex double alpha,
+                  const complex double *x, const int incx, const complex double *y,
+                  const int incy, complex double *A, const int lda);
+void CTRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const complex float *A, const int lda, complex float *x, 
+                   const int incx);
+void CTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const complex float alpha, const complex float *A, const int lda,
+                 complex float *B, const int ldb);
+void ZTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const complex double alpha, const complex double *A, const int lda,
+                 complex double *B, const int ldb);
+void CTRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const complex float *A, const int lda, complex float *X,
+                 const int incX);
+void CAXPY(const int n, const complex float alpha, complex float *X, const int incX, complex float *Y, const int incy);
+void ZAXPY(const int n, const complex double alpha, complex double *X, const int incX, complex double *Y, const int incY);
+int ICAMAX (const int n, complex float *X, const int incX);
+int IZAMAX (const int n, complex double *X, const int incX);
+complex float CDOTU(const int n, const complex float *x, const int incx, const complex float *y, const int incy);
+complex double ZDOTU(const int n, const complex double *x, const int incx, const complex double *y, const int incy);
+void CSWAP(const int n, complex float *x, const int incx, complex float *y, const int incy);
+void ZSWAP(const int n, complex double *x, const int incx, complex double *y, const int incy);
+
+#if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
+#error not implemented
+#elif defined(STARPU_MKL)
+
+extern void cgemm_ (const char *transa, const char *transb, const int *m,
+                   const int *n, const int *k, const complex float *alpha, 
+                   const complex float *A, const int *lda, const complex float *B, 
+                   const int *ldb, const complex float *beta, complex float *C, 
+                   const int *ldc);
+extern void zgemm_ (const char *transa, const char *transb, const int *m,
+                   const int *n, const int *k, const complex double *alpha, 
+                   const complex double *A, const int *lda, const complex double *B, 
+                   const int *ldb, const complex double *beta, complex double *C, 
+                   const int *ldc);
+extern void cgemv_(const char *trans, int *m, int *n, complex float *alpha,
+                   void *a, int *lda, void *x, int *incx, 
+                   complex float *beta, void *y, int *incy);
+extern void zgemv_(const char *trans, int *m, int *n, complex double *alpha,
+                   void *a, int *lda, void *x, int *incx,
+                   complex double *beta, void *y, int *incy);
+extern void csyr_ (const char *uplo, const int *n, const complex float *alpha,
+                  const complex float *x, const int *incx, complex float *A, const int *lda);
+extern void csyrk_ (const char *uplo, const char *trans, const int *n,
+                   const int *k, const complex float *alpha, const complex float *A,
+                   const int *lda, const complex float *beta, complex float *C,
+                   const int *ldc);
+extern void ctrsm_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int *m, const int *n,
+                   const complex float *alpha, const complex float *A, const int *lda,
+                   complex float *B, const int *ldb);
+extern void ztrsm_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int *m, const int *n,
+                   const complex double *alpha, const complex double *A, const int *lda,
+                   complex double *B, const int *ldb);
+extern complex double scasum_ (const int *n, const complex float *x, const int *incx);
+extern complex double dzasum_ (const int *n, const complex double *x, const int *incx);
+extern void cscal_ (const int *n, const complex float *alpha, complex float *x,
+                   const int *incx);
+extern void zscal_ (const int *n, const complex double *alpha, complex double *x,
+                   const int *incx);
+extern void cgeru_(const int *m, const int *n, const complex float *alpha,
+                  const complex float *x, const int *incx, const complex float *y,
+                  const int *incy, complex float *A, const int *lda);
+extern void zgeru_(const int *m, const int *n, const complex double *alpha,
+                  const complex double *x, const int *incx, const complex double *y,
+                  const int *incy, complex double *A, const int *lda);
+extern void ctrsv_ (const char *uplo, const char *trans, const char *diag, 
+                   const int *n, const complex float *A, const int *lda, complex float *x, 
+                   const int *incx);
+extern void ctrmm_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int *m, const int *n,
+                 const complex float *alpha, const complex float *A, const int *lda,
+                 complex float *B, const int *ldb);
+extern void ztrmm_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int *m, const int *n,
+                 const complex double *alpha, const complex double *A, const int *lda,
+                 complex double *B, const int *ldb);
+extern void ctrmv_(const char *uplo, const char *transA, const char *diag,
+                 const int *n, const complex float *A, const int *lda, complex float *X,
+                 const int *incX);
+extern void caxpy_(const int *n, const complex float *alpha, complex float *X, const int *incX,
+		complex float *Y, const int *incy);
+extern void zaxpy_(const int *n, const complex double *alpha, complex double *X, const int *incX,
+		complex double *Y, const int *incy);
+extern int icamax_(const int *n, complex float *X, const int *incX);
+extern int izamax_(const int *n, complex double *X, const int *incX);
+/* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
+extern complex double cdotu_(const int *n, const complex float *x, const int *incx, const complex float *y, const int *incy);
+extern complex double zdotu_(const int *n, const complex double *x, const int *incx, const complex double *y, const int *incy);
+extern void cswap_(const int *n, complex float *x, const int *incx, complex float *y, const int *incy);
+extern void zswap_(const int *n, complex double *x, const int *incx, complex double *y, const int *incy);
+
+#endif
+
+#endif /* __BLAS_COMPLEX_H__ */

+ 19 - 0
examples/lu/clu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu.c"

+ 19 - 0
examples/lu/clu_implicit.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu_implicit.c"

+ 19 - 0
examples/lu/clu_implicit_pivot.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu_implicit_pivot.c"

+ 19 - 0
examples/lu/clu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu_kernels.c"

+ 19 - 0
examples/lu/clu_pivot.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu_pivot.c"

+ 52 - 0
examples/lu/complex_double.h

@@ -0,0 +1,52 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <complex.h>
+#include "blas_complex.h"
+#define TYPE complex double
+#define CUBLAS_TYPE cuDoubleComplex
+
+#define STARPU_LU(name)       starpu_zlu_##name
+#define COMPLEX_LU
+
+#ifdef STARPU_HAVE_MAGMA
+#include <magmablas.h>
+#define CUBLAS_GEMM	magmablas_zgemm
+#define CUBLAS_TRSM	magmablas_ztrsm
+#else
+#define CUBLAS_GEMM	cublasZgemm
+#define CUBLAS_TRSM	cublasZtrsm
+#endif
+
+#define CUBLAS_SCAL	cublasZscal
+#define CUBLAS_GER	cublasZgeru
+#define CUBLAS_SWAP	cublasZswap
+#define CUBLAS_IAMAX	cublasIzamax
+
+#define CPU_GEMM	ZGEMM
+#define CPU_TRSM	ZTRSM
+#define CPU_SCAL	ZSCAL
+#define CPU_GER		ZGERU
+#define CPU_SWAP	ZSWAP
+
+#define CPU_TRMM	ZTRMM
+#define CPU_AXPY	ZAXPY
+#define CPU_ASUM	DZASUM
+#define CPU_IAMAX	IZAMAX
+
+#define PIVOT_THRESHHOLD	10e-5

+ 52 - 0
examples/lu/complex_float.h

@@ -0,0 +1,52 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <complex.h>
+#include "blas_complex.h"
+#define TYPE complex float
+#define CUBLAS_TYPE cuComplex
+
+#define STARPU_LU(name)       starpu_clu_##name
+#define COMPLEX_LU
+
+#ifdef STARPU_HAVE_MAGMA
+#include <magmablas.h>
+#define CUBLAS_GEMM	magmablas_cgemm
+#define CUBLAS_TRSM	magmablas_ctrsm
+#else
+#define CUBLAS_GEMM	cublasCgemm
+#define CUBLAS_TRSM	cublasCtrsm
+#endif
+
+#define CUBLAS_SCAL	cublasCscal
+#define CUBLAS_GER	cublasCgeru
+#define CUBLAS_SWAP	cublasCswap
+#define CUBLAS_IAMAX	cublasIcamax
+
+#define CPU_GEMM	CGEMM
+#define CPU_TRSM	CTRSM
+#define CPU_SCAL	CSCAL
+#define CPU_GER		CGERU
+#define CPU_SWAP	CSWAP
+
+#define CPU_TRMM	CTRMM
+#define CPU_AXPY	CAXPY
+#define CPU_ASUM	SCASUM
+#define CPU_IAMAX	ICAMAX
+
+#define PIVOT_THRESHHOLD	10e-5

+ 1 - 0
examples/lu/double.h

@@ -16,6 +16,7 @@
  */
 
 #define TYPE double
+#define CUBLAS_TYPE TYPE
 
 #define STARPU_LU(name)       starpu_dlu_##name
 

+ 1 - 0
examples/lu/float.h

@@ -17,6 +17,7 @@
 
 
 #define TYPE float
+#define CUBLAS_TYPE TYPE
 
 #define STARPU_LU(name)       starpu_slu_##name
 

+ 13 - 0
examples/lu/lu_example.c

@@ -164,6 +164,10 @@ static void init_matrix(void)
 		for (i = 0; i < size; i++)
 		{
 			A[i + j*size] = (TYPE)starpu_drand48();
+#ifdef COMPLEX_LU
+			/* also randomize the imaginary component for complex number cases */
+			A[i + j*size] += (TYPE)(I*starpu_drand48());
+#endif
 		}
 	}
 
@@ -249,11 +253,20 @@ static void check_result(void)
 	CPU_AXPY(size*size, -1.0, A_saved, 1, L, 1);
 	display_matrix(L, size, size, "Residuals");
 	
+#ifdef COMPLEX_LU
+	double err = CPU_ASUM(size*size, L, 1);
+	int max = CPU_IAMAX(size*size, L, 1);
+	TYPE l_max = L[max];
+
+	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
+	FPRINTF(stderr, "Max error : %e\n", sqrt(creal(l_max)*creal(l_max)+cimag(l_max)*cimag(l_max)));
+#else
 	TYPE err = CPU_ASUM(size*size, L, 1);
 	int max = CPU_IAMAX(size*size, L, 1);
 
 	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
 	FPRINTF(stderr, "Max error : %e\n", L[max]);
+#endif
 
 	double residual = frobenius_norm(L, size);
 	double matnorm = frobenius_norm(A_saved, size);

+ 19 - 0
examples/lu/lu_example_complex_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "lu_example.c"

+ 19 - 0
examples/lu/lu_example_complex_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "lu_example.c"

+ 28 - 18
examples/lu/xlu_kernels.c

@@ -22,6 +22,11 @@
 #define xstr(s)        str(s)
 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
 
+#ifdef STARPU_USE_CUDA
+static const TYPE p1 =  1.0f;
+static const TYPE m1 = -1.0f;
+#endif
+
 /*
  *   U22 
  */
@@ -54,10 +59,10 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 			break;
 
 #ifdef STARPU_USE_CUDA
-		case 1:
+		case 1: {
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
-				(TYPE)-1.0, right, ld21, left, ld12,
-				(TYPE)1.0f, center, ld22);
+				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
+				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
 
 			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
@@ -67,6 +72,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
+		}
 #endif
 		default:
 			STARPU_ABORT();
@@ -140,7 +146,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 #ifdef STARPU_USE_CUDA
 		case 1:
 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
-					(TYPE)1.0, sub11, ld11, sub12, ld12);
+					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 
 			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
@@ -221,7 +227,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 #ifdef STARPU_USE_CUDA
 		case 1:
 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
-					(TYPE)1.0, sub11, ld11, sub21, ld21);
+					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
@@ -307,17 +313,19 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
+				TYPE inv_pivot;
 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
 				cudaStreamSynchronize(0);
 
 				STARPU_ASSERT(pivot != 0.0);
 				
-				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				inv_pivot = 1.0/pivot;
+				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
-						&sub11[(z+1)+z*ld], 1,
-						&sub11[z+(z+1)*ld], ld,
-						&sub11[(z+1) + (z+1)*ld],ld);
+				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
+						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			
 			cudaThreadSynchronize();
@@ -423,20 +431,21 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
+				TYPE inv_pivot;
 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
 				cudaStreamSynchronize(0);
 
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				{
 					/* find the pivot */
-					int piv_ind = CUBLAS_IAMAX(nx - z, &sub11[z*(ld+1)], ld) - 1;
+					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
 	
 					ipiv[z + first] = piv_ind + z + first;
 
 					/* swap if needed */
 					if (piv_ind != 0)
 					{
-						CUBLAS_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
+						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
 					}
 
 					cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
@@ -445,12 +454,13 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
 				STARPU_ASSERT(pivot != 0.0);
 				
-				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				inv_pivot = 1.0/pivot;
+				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
-						&sub11[(z+1)+z*ld], 1,
-						&sub11[z+(z+1)*ld], ld,
-						&sub11[(z+1) + (z+1)*ld],ld);
+				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
+						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
 				
 			}
 
@@ -534,7 +544,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 				unsigned rowpiv = ipiv[row+first] - first;
 				if (rowpiv != row)
 				{
-					CUBLAS_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
+					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
 				}
 			}
 

+ 19 - 0
examples/lu/zlu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu.c"

+ 19 - 0
examples/lu/zlu_implicit.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu_implicit.c"

+ 19 - 0
examples/lu/zlu_implicit_pivot.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu_implicit_pivot.c"

+ 19 - 0
examples/lu/zlu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu_kernels.c"

+ 19 - 0
examples/lu/zlu_pivot.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu_pivot.c"

+ 19 - 1
gcc-plugin/examples/Makefile.am

@@ -14,12 +14,30 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 noinst_PROGRAMS =				\
-  matrix-mult
+  matrix-mult stencil5
+
+if !NO_BLAS_LIB
+noinst_PROGRAMS +=				\
+  cholesky/cholesky
+endif
 
 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
 
 AM_CPPFLAGS =						\
   -I$(top_srcdir)/include				\
+  -I$(top_srcdir)/examples				\
   $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
 
 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
+
+if !NO_BLAS_LIB
+cholesky_cholesky_SOURCES	=		\
+	cholesky/cholesky.c		\
+	cholesky/cholesky_models.c	\
+	cholesky/cholesky_kernels.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+cholesky_cholesky_LDADD	=	\
+	$(STARPU_BLAS_LDFLAGS)
+endif
+

+ 127 - 0
gcc-plugin/examples/stencil5.c

@@ -0,0 +1,127 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <math.h>
+
+#ifndef STARPU_GCC_PLUGIN
+# error must be compiled with the StarPU GCC plug-in
+#endif
+
+/* Definition of the StarPU task and its CPU implementation.  */
+static void stencil5(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
+	__attribute__ ((task));
+
+static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
+	__attribute__ ((task_implementation ("cpu", stencil5)));
+
+static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
+{
+	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
+}
+
+#define NITER_DEF 20000
+#define X         10
+#define Y         10
+
+int display = 0;
+int niter = NITER_DEF;
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-iter") == 0) {
+			char *argptr;
+			niter = strtol(argv[++i], &argptr, 10);
+		}
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+	}
+}
+
+static float my_rand (void)
+{
+	return (float) rand () / (float) RAND_MAX;
+}
+
+int main(int argc, char **argv)
+{
+        int x, y;
+        float mean=0;
+        float matrix[X][Y];
+
+        parse_args(argc, argv);
+
+	srand (time (NULL));
+        for(x = 0; x < X; x++) {
+                for (y = 0; y < Y; y++) {
+                        matrix[x][y] = my_rand () * 100;
+                        mean += matrix[x][y];
+                }
+        }
+        mean /= (x*y);
+
+        if (display) {
+                fprintf(stdout, "mean=%f\n", mean);
+                for(x = 0; x < X; x++) {
+                        for (y = 0; y < Y; y++) {
+                                fprintf(stdout, "%3f ", matrix[x][y]);
+                        }
+                        fprintf(stdout, "\n");
+                }
+        }
+
+#pragma starpu initialize
+
+        for(x = 0; x < X; x++) {
+		for (y = 0; y < Y; y++) {
+#pragma starpu register &matrix[x][y] 1
+		}
+	}
+
+	while(niter--) {
+                for (x = 1; x < X-1; x++) {
+                        for (y = 1; y < Y-1; y++) {
+                                stencil5(&matrix[x][y], &matrix[x-1][y], &matrix[x+1][y],
+					 &matrix[x][y-1], &matrix[x][y+1]);
+                        }
+                }
+        }
+
+#pragma starpu wait
+
+        for(x = 0; x < X; x++) {
+                for (y = 0; y < Y; y++) {
+#pragma starpu unregister &matrix[x][y]
+                }
+        }
+
+#pragma starpu shutdown
+
+        if (display) {
+                fprintf(stdout, "mean=%f\n", mean);
+                for(x = 0; x < X; x++) {
+                        for (y = 0; y < Y; y++) {
+                                fprintf(stdout, "%3f ", matrix[x][y]);
+                        }
+                        fprintf(stdout, "\n");
+                }
+        }
+
+	return EXIT_SUCCESS;
+}

+ 6 - 0
gcc-plugin/src/starpu.c

@@ -690,6 +690,12 @@ handle_task_attribute (tree *node, tree name, tree args,
 	      "%<task%> attribute only applies to functions");
   else
     {
+      if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (fn))))
+	/* Raise an error but keep going to avoid spitting out too many
+	   errors at the user's face.  */
+	error_at (DECL_SOURCE_LOCATION (fn),
+		  "task return type must be %<void%>");
+
       /* This is a function declaration for something local to this
 	 translation unit, so add the `task' attribute to FN.  */
       *no_add_attrs = false;

+ 9 - 4
gcc-plugin/tests/run-test.in

@@ -261,10 +261,15 @@ unsatisfied directives."
 
 (define (executable-file source)
   "Return the name of the executable file corresponding to SOURCE."
-  (let ((dot (string-rindex source #\.)))
-    (if dot
-        (substring source 0 dot)
-        (string-append source ".exe"))))
+  (let* ((dot (string-rindex source #\.))
+         (exe (if dot
+                  (substring source 0 dot)
+                  (string-append source ".exe")))
+         )
+  (if (string-prefix? %srcdir exe)
+      (string-append %builddir (substring exe (string-length %srcdir)))
+      exe
+      )))
 
 (define (compile/match file cc cflags ldflags)
   "Read directives from FILE, and compiler/link/run it.  Make sure directives

+ 3 - 0
gcc-plugin/tests/task-errors.c

@@ -49,6 +49,9 @@ static void my_task_wrong_target_arg (int foo, char *bar) /* (error "string cons
 static void my_task_with_a_body (int foo, char *bar)
   __attribute__ ((task, unused));
 
+extern int my_task_not_void (int foo) /* (error "return type") */
+  __attribute__ ((task));
+
 
 static void
 my_task_cpu (int foo, float *bar)

+ 5 - 7
include/starpu_scheduler.h

@@ -78,15 +78,13 @@ struct starpu_sched_policy_s {
 
 	/* Insert a task into the scheduler. */
         int (*push_task)(struct starpu_task *, unsigned);
-	/* Notify the scheduler that a task was pushed on the worker. This
-	 * method is called when a task that was explicitely assigned to a
-	 * worker is scheduled. This method therefore permits to keep the state
-	 * of of the scheduler coherent even when StarPU bypasses the
-	 * scheduling strategy. */
+	/* Notify the scheduler that a task was directly pushed to the worker
+	 * without going through the scheduler. This method is called when a
+	 * task is explicitely assigned to a worker. This method therefore
+	 * permits to keep the timing state of the scheduler coherent even
+	 * when StarPU bypasses the scheduling strategy. */
 	void (*push_task_notify)(struct starpu_task *, int workerid, unsigned);
 
-	/* Insert a priority task into the scheduler. */
-        int (*push_prio_task)(struct starpu_task *, unsigned);
 
 	/* Get a task from the scheduler. The mutex associated to the worker is
 	 * already taken when this method is called. */

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -429,12 +429,12 @@ static void benchmark_all_gpu_devices(void)
 #endif
 
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	ncpus = _starpu_topology_get_nhwcpu(config);
+	ncpus = config->topology.ncpus;
 
 	/* TODO: measure bandwidth between GPU-GPU */
 
 #ifdef STARPU_USE_CUDA
-        cudaGetDeviceCount(&ncuda);
+	ncuda = _starpu_get_cuda_device_count();
 	for (i = 0; i < ncuda; i++)
 	{
 		fprintf(stderr," CUDA %d...", i);
@@ -481,7 +481,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), forced_hostname);
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
@@ -509,11 +509,11 @@ static void load_bus_affinity_file_content(void)
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	ncpus = _starpu_topology_get_nhwcpu(config);
+	ncpus = config->topology.ncpus;
         int gpu;
 
 #ifdef STARPU_USE_CUDA
-        cudaGetDeviceCount(&ncuda);
+	ncuda = _starpu_get_cuda_device_count();
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 		int ret;
@@ -1020,9 +1020,9 @@ static void check_bus_config_file()
                 fclose(f);
 
                 // Loading current configuration
-                ncpus = _starpu_topology_get_nhwcpu(config);
+                ncpus = config->topology.ncpus;
 #ifdef STARPU_USE_CUDA
-                cudaGetDeviceCount(&ncuda);
+		ncuda = _starpu_get_cuda_device_count();
 #endif
 #ifdef STARPU_USE_OPENCL
                 nopencl = _starpu_opencl_get_device_count();

+ 219 - 14
src/core/perfmodel/perfmodel_history.c

@@ -204,17 +204,117 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_
 	}
 }
 
-static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
+static void parse_arch(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history, unsigned archmin, unsigned archmax, int skiparch)
 {
-	unsigned arch;
-	unsigned nimpl;
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++) {
-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
-			parse_per_arch_model_file(f, &model->per_arch[arch][nimpl], scan_history);
+	unsigned arch, impl;
+	struct starpu_per_arch_perfmodel_t dummy;
+	int nimpls, implmax, skipimpl;
+	unsigned ret;
+	
+
+	for (arch = archmin; arch < archmax; arch++) {
+		_starpu_drop_comments(f);
+		ret = fscanf(f, "%d\n", &nimpls);
+		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
+		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
+		for (impl = 0; impl < implmax; impl++) {
+			parse_per_arch_model_file(f, &model->per_arch[arch][impl], scan_history);
+		}
+		if (skipimpl > 0) {
+			for (impl = 0; impl < skipimpl; impl++) {
+				parse_per_arch_model_file(f, &dummy, 0);
+			}
+		}
+	}
+
+	if (skiparch > 0) {
+		_starpu_drop_comments(f);
+		ret = fscanf(f, "%d\n", &nimpls);
+		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
+		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
+		for (arch = 0; arch < skiparch; arch ++) {
+			for (impl = 0; impl < implmax; impl++) {
+				parse_per_arch_model_file(f, &dummy, 0);
+			}
+			if (skipimpl > 0) {
+				for (impl = 0; impl < skipimpl; impl++) {
+					parse_per_arch_model_file(f, &dummy, 0);
+				}
+			}
 		}
 	}
 }
 
+static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
+{
+	unsigned ret;
+	unsigned archmin = 0;
+	unsigned max_gordondevs = 1; /* XXX : we need a STARPU_MAXGORDONDEVS cst */
+	unsigned narchs;
+	int nimpls;
+
+	/* We could probably write a clean loop here, but the code would not
+	 * really be easier to read. */
+
+	/* Parsing CPUs */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+
+	_STARPU_DEBUG("Parsing %u CPUs\n", narchs);
+	if (narchs > 0)
+	{
+		parse_arch(f, model, scan_history,
+				archmin,
+				STARPU_MIN(narchs, STARPU_MAXCPUS),
+				narchs - STARPU_MAXCPUS);
+	}
+
+	/* Parsing CUDA devs */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+	_STARPU_DEBUG("Parsing %u CUDA devices\n", narchs);
+	if (narchs > 0)
+	{
+		archmin += STARPU_MAXCPUS;
+		parse_arch(f, model, scan_history,
+				archmin,
+				archmin + STARPU_MIN(narchs, STARPU_MAXCUDADEVS),
+				narchs - STARPU_MAXCUDADEVS);
+	}
+
+	/* Parsing OpenCL devs */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+
+	_STARPU_DEBUG("Parsing %u OpenCL devices\n", narchs);
+	if (narchs > 0)
+	{
+		archmin += STARPU_MAXCUDADEVS;
+		parse_arch(f, model, scan_history,
+				archmin,
+				archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
+				narchs - STARPU_MAXOPENCLDEVS);
+	}
+
+	/* Parsing Gordon implementations */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+
+	_STARPU_DEBUG("Parsing %u Gordon devices\n", narchs);
+	if (narchs > 0)
+	{
+		archmin += STARPU_MAXOPENCLDEVS;
+		parse_arch(f, model, scan_history,
+				archmin,
+				archmin + max_gordondevs,
+				narchs - max_gordondevs);
+	}
+}
+
 
 static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
 {
@@ -235,7 +335,12 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 		}
 	}
 
+	if (nentries == 0)
+		return;
 	/* header */
+	char archname[32];
+	starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
+	fprintf(f, "# Model for %s\n", archname);
 	fprintf(f, "# number of entries\n%u\n", nentries);
 
 	dump_reg_model(f, model, arch, nimpl);
@@ -250,23 +355,123 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 			ptr = ptr->next;
 		}
 	}
+
+	fprintf(f, "\n##################\n");
 }
 
-static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
+static unsigned get_n_entries(struct starpu_perfmodel_t *model, unsigned arch, unsigned impl)
 {
-	fprintf(f, "#################\n");
+	struct starpu_per_arch_perfmodel_t *per_arch_model;
+	per_arch_model = &model->per_arch[arch][impl];
+	/* count the number of elements in the lists */
+	struct starpu_history_list_t *ptr = NULL;
+	unsigned nentries = 0;
+
+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+	{
+		/* Dump the list of all entries in the history */
+		ptr = per_arch_model->list;
+		while(ptr) {
+			nentries++;
+			ptr = ptr->next;
+		}
+	}
+	return nentries;
+}
 
+static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
+{
+	unsigned number_of_archs[4] = { 0, 0, 0, 0};
 	unsigned arch;
 	unsigned nimpl;
+	unsigned idx = 0;
+
+	/* Finding the number of archs to write for each kind of device */
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
 	{
+		switch (arch)
+		{
+			case STARPU_CUDA_DEFAULT:
+			case STARPU_OPENCL_DEFAULT:
+			case STARPU_GORDON_DEFAULT:
+				idx++;
+				break;
+			default:
+				break;
+		}
+
+		unsigned nentries = 0;
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+			nentries = get_n_entries(model, arch, nimpl) != 0;
+			if (nentries > 0)
+			{
+				number_of_archs[idx]++;
+				break;
+			}
+		}
+	}
+
+	/* Writing stuff */
+	char *name;
+	unsigned substract_to_arch = 0;
+	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
+	{
+		switch (arch)
+		{
+			case STARPU_CPU_DEFAULT:
+				name = "CPU";
+				fprintf(f, "##################\n");
+				fprintf(f, "# %ss\n", name);
+				fprintf(f, "# number of %s architectures\n", name);
+				fprintf(f, "%u\n", number_of_archs[0]);
+				break;
+			case STARPU_CUDA_DEFAULT:
+				name = "CUDA";
+				substract_to_arch = STARPU_MAXCPUS;
+				fprintf(f, "##################\n");
+				fprintf(f, "# %ss\n", name);
+				fprintf(f, "# number of %s architectures\n", name);
+				fprintf(f, "%u\n", number_of_archs[1]);
+				break;
+			case STARPU_OPENCL_DEFAULT:
+				name = "OPENCL";
+				substract_to_arch += STARPU_MAXCUDADEVS;
+				fprintf(f, "##################\n");
+				fprintf(f, "# %ss\n", name);
+				fprintf(f, "# number of %s architectures\n", name);
+				fprintf(f, "%u\n", number_of_archs[2]);
+				break;
+			case STARPU_GORDON_DEFAULT:
+				name = "GORDON";
+				substract_to_arch += STARPU_MAXOPENCLDEVS;
+				fprintf(f, "##################\n");
+				fprintf(f, "# %ss\n", name);
+				fprintf(f, "# number of %s architectures\n", name);
+				fprintf(f, "%u\n", number_of_archs[3]);
+				break;
+			default:
+				break;
+		}
+
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
-			char archname[32];
-			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
-			fprintf(f, "# Model for %s\n", archname);
+			if (get_n_entries(model, arch, nimpl) == 0)
+				break;
+
+		}
+		unsigned max_impl = nimpl;
+
+		if (max_impl == 0)
+			continue;
+
+		fprintf(f, "###########\n");
+		fprintf(f, "# %s_%u\n", name, arch - substract_to_arch);
+		fprintf(f, "# number of implementations\n");
+		fprintf(f, "%u\n", max_impl);
+		for (nimpl = 0; nimpl < max_impl; nimpl++)
+		{
 			dump_per_arch_model_file(f, model, arch, nimpl);
-			fprintf(f, "\n##################\n");
 		}
 	}
 }
@@ -300,7 +505,7 @@ static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *a
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), forced_hostname);
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
@@ -351,7 +556,7 @@ static void get_model_path(struct starpu_perfmodel_t *model, char *path, size_t
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), forced_hostname);
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);

+ 1 - 1
src/core/workers.c

@@ -87,7 +87,7 @@ static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct
 			cl->opencl_funcs[nimpl] == NULL);
 	case STARPU_GORDON_WORKER:
 		return !(cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS &&
-			cl->gordon_funcs[nimpl] == NULL);
+			cl->gordon_funcs[nimpl] == 0);
 	default:
 		return 0;
 	}

+ 5 - 1
src/drivers/opencl/driver_opencl.c

@@ -357,6 +357,11 @@ void _starpu_opencl_init(void)
                 // Get location of OpenCl kernel source files
                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
 
+		if (nb_devices > STARPU_MAXOPENCLDEVS) {
+			_STARPU_DISP("# Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
+			nb_devices = STARPU_MAXOPENCLDEVS;
+		}
+
                 // initialise internal structures
                 for(i=0 ; i<nb_devices ; i++) {
                         contexts[i] = NULL;
@@ -538,7 +543,6 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 	struct timespec codelet_start, codelet_end;
 
-	int workerid = args->workerid;
 	STARPU_ASSERT(task);
 	struct starpu_codelet_t *cl = task->cl;
 	STARPU_ASSERT(cl);

+ 3 - 1
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -478,6 +478,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 					) {
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
+
 			}
 
 			if (local_task_length[worker_in_ctx] == -1.0)
@@ -492,7 +493,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 				unknown = 1;
 
 			if (unknown)
-				continue;
+					continue;
 
 			exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
 
@@ -501,6 +502,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 				/* a better solution was found */
 				best_exp_end = exp_end[worker_in_ctx];
 				best_impl = nimpl;
+
 			}
 
 			local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);

+ 410 - 106
src/sched_policies/detect_combined_workers.c

@@ -21,150 +21,454 @@
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
-#endif
 
-#ifdef STARPU_HAVE_HWLOC
-/* This function returns 1 the subtree induced by obj only contains CPU
- * workers, otherwise 0 is returned. This function registers all valid worker
- * combination below obj. The id of the CPU workers are put in the worker_array
- * and their count is put in the worker_cnt pointer. */
-static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array, int *worker_cnt)
+/* tree_t
+ * ======
+ * Purpose
+ * =======
+ * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
+ * object and storing every workers it contained in every sub-trees by recursion.
+ *
+ * Fields
+ * ======
+ * obj			A hwloc object which can be a root or a leaf, it may be a numa node, a cache memory or a CPU, etc...
+ *
+ * nb_workers		Number of CPU workers which can be found by recursion in all the sub-trees beneath this one
+ 			or in this very object.
+ *
+ * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
+ */
+
+typedef struct tree_s{
+    hwloc_obj_t obj;
+    unsigned nb_workers;
+    int *workers;
+} tree_t;
+
+
+/* gather_trees
+ * ============
+ * Purpose
+ * =======
+ * Gather all the workers of every source tree in one target tree.
+ * We assume the target array of workers is big enough to contain all the workers.
+ *
+ * Arguments
+ * =========
+ * target_tree		(input, output)
+ *			Pointer to the tree which will contain all the workers of every source.
+ *
+ * source_trees		(input)
+ *			Array of trees we want to combine in a unique tree.
+ *
+ * nb_source_trees	(input)
+ *			Number of trees we want to combine (size of the array).
+ */
+
+static void gather_trees(tree_t *target_tree, tree_t *source_trees, unsigned nb_source_trees)
+{
+    unsigned tree_id, worker_id, index = 0;
+    for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
+	for(worker_id = 0; worker_id < source_trees[tree_id].nb_workers; ++worker_id)
+	    target_tree->workers[index++] = source_trees[tree_id].workers[worker_id];
+}
+
+/* assign_multiple_trees
+ * ========================
+ * Purpose
+ * =======
+ * Assign every tree which is large enough (greater than min_size) and merge small ones.
+ * If there is no tree large enough to be assigned any more, we return.
+ *
+ * Return value
+ * ============
+ * The number of workers assigned during the function.
+ *
+ * Arguments
+ * =========
+ * trees		(input, output)
+ *			In entry, array of trees to assign. In the end at most one tree still contains workers.
+ *
+ * nb_trees		(input)
+ *			The number of trees (size of the array).
+ *
+ * min_size		(input)
+ *			Minimum size of a combined worker.
+ *
+ * max_size		(input)
+ *			Maximum size of a combined worker.
+ */
+
+static unsigned assign_multiple_trees(tree_t *trees, unsigned nb_trees, int min_size, int max_size)
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    unsigned short complete = 0;
+    unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
+
+    for(tree_id = 0; tree_id < nb_trees; ++tree_id)
+	nb_workers_total += trees[tree_id].nb_workers;;
 
-	/* Is this a leaf ? (eg. a PU for hwloc) */
-	int is_leaf = !hwloc_compare_types(config->cpu_depth, obj->depth);
+    while(!complete)
+    {
+	complete = 1;
 
-	if (is_leaf)
+	/* First we manage to assign every subtree large enough to be assigned alone */
+	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
 	{
-		struct starpu_worker_s *worker = obj->userdata;
+	    if(trees[tree_id].nb_workers== 0) // An already assigned subtree
+		continue;
 
-		/* If this is a CPU worker, append its id at the end of the
-		 * list */
-		if (worker && worker->arch == STARPU_CPU_WORKER)
+	    nb_workers_tree = trees[tree_id].nb_workers;
+
+	    /* We shouldn't assign a small tree if we could assign the whole trees instead */
+	    if(nb_workers_tree >= min_size && nb_workers_total > max_size)
+	    {
+		int ret = starpu_combined_worker_assign_workerid(nb_workers_tree, trees[tree_id].workers);
+		STARPU_ASSERT(ret >= 0);
+		nb_workers_assigned += nb_workers_tree;
+		nb_workers_total -= nb_workers_tree;
+		trees[tree_id].nb_workers = 0;
+	    }
+	}
+
+	/* Then we merge too small subtrees into not too large ones
+	 * if we manage to merge some subtrees we turn the flag
+	 * complete to 0 thus we know he have to start again to assign
+	 * just merged subtrees */
+	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
+	{
+	    if(trees[tree_id].nb_workers == 0) // An already assigned subtree
+		continue;
+
+	    nb_workers_tree = trees[tree_id].nb_workers;
+
+	    /* We go through the array to find another subtree we can merge with this one */
+	    for(tree_id2 = 0; tree_id2 < nb_trees; ++tree_id2)
+	    {
+		if(trees[tree_id2].nb_workers == 0 || tree_id == tree_id2) // An already assigned subtree or the same
+		    continue;
+
+		nb_workers_tree2 = trees[tree_id2].nb_workers;
+
+		/*  We can merge the two subtrees, let's do it */
+		if(nb_workers_tree + nb_workers_tree2 <= max_size)
 		{
-			worker_array[*worker_cnt] = worker->workerid;
-			*worker_cnt = *worker_cnt + 1;
+		    for(worker_id = 0; worker_id < nb_workers_tree2; ++worker_id)
+			trees[tree_id].workers[nb_workers_tree + worker_id] = trees[tree_id2].workers[worker_id];
+
+		    trees[tree_id].nb_workers += nb_workers_tree2;
+		    trees[tree_id2].nb_workers = 0;
+
+		    /* We just merged two subtrees, we need to restart again and try to assign it */
+		    complete = 0;
+		    break;
 		}
+	    }
 
-		/* We cannot create a combined worker only if there is a CPU
-		 * worker. */
-		return (!worker || worker->arch == STARPU_CPU_WORKER);
+	    if(!complete)
+		break;
 	}
+    }
 
-	/* If there is only one child, we go to the next level directly */
-	if (obj->arity == 1)
-		return find_combinations_with_hwloc_rec(obj->children[0], worker_array, worker_cnt);
+    return nb_workers_assigned;
+}
 
-	/* We recursively go from the root to the leaves of the tree to find
-	 * subtrees that only have CPUs as leaves. */
-	unsigned cpu_children_cnt = 0;
+/* find_and_assign_combinations_with_hwloc_recursive
+ * =================================================
+ * Purpose
+ * =======
+ * Go through the tree given as parameter and try to assign them. Workers it didn't succeed to
+ * assign are given back to the calling function to be assigned using data from other subtrees if so.
+ *
+ * Return value
+ * ============
+ * The number of workers left to be assigned.
+ *
+ * Arguments
+ * =========
+ * tree			(input, output)
+ *			Tree structure containing the root to process in entry.
+ *			When the function returns it also contains the number of workers left
+ *			to be assigned and these very workers in the array previously allocated.
+ *
+ * min_size		(input)
+ *			Minimum size of a combined worker.
+ *
+ * max_size		(input)
+ *			Maximum size of a combined worker.
+ */
+
+static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree, int min_size, int max_size)
+{
+    unsigned subtree_id, nb_workers = 0;
 
-	int worker_array_rec[STARPU_NMAXWORKERS];
-	int worker_cnt_rec = 0;
-	memset(worker_array_rec, 0, sizeof(int)*STARPU_NMAXWORKERS);
+    hwloc_obj_t obj = tree->obj;
+    int *workers = tree->workers;
 
-	unsigned i;
-	for (i = 0; i < obj->arity; i++)
+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
+
+    /* Is this a leaf ? (eg. a PU for hwloc) */
+    if (!hwloc_compare_types(config->cpu_depth, obj->depth))
+    {
+	struct starpu_worker_s *worker = obj->userdata;
+
+	/* If this is a CPU worker add it at the beginning
+	 * of the array , write 1 in the field nb_workers and
+	 * return the number of CPU workers found : 1 in this case. */
+	if (worker && worker->arch == STARPU_CPU_WORKER)
 	{
-		int valid_subtree = find_combinations_with_hwloc_rec(obj->children[i],
-						worker_array_rec, &worker_cnt_rec);
-		if (valid_subtree)
-			cpu_children_cnt++;
+	    workers[0] = worker->workerid;
+	    tree->nb_workers = 1;
+	    return 1;
 	}
 
-	int child;
+	tree->nb_workers = 0;
+	return 0;
+    }
+
+
+    /* If there is only one child, we go to the next level right away */
+    if (obj->arity == 1)
+    {
+	tree_t subtree = *tree;
+	subtree.obj = obj->children[0];
+	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
+	tree->nb_workers = nb_workers;
+	return nb_workers;
+    }
+
+    /* We recursively go to the leaves of the tree to find subtrees which have the biggest number of
+     * CPU leaves that fits between min and max. */
+
+    /* We allocate an array of tree structures which will contain the current node's subtrees data */
+    tree_t *subtrees = (tree_t *) malloc(obj->arity * sizeof(tree_t));
+
+    /* We allocate the array containing the workers of each subtree and initialize the fields left */
+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
+    {
+	tree_t *subtree = subtrees + subtree_id;
+
+	subtree->obj = obj->children[subtree_id];
+	subtree->nb_workers = 0;
+	subtree->workers = (int *) malloc(config->topology.nhwcpus * sizeof(int));
+    }
+
+    /* We recursively go through every subtree and get all the workers which are not assigned yet */
+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
+	nb_workers += find_and_assign_combinations_with_hwloc_recursive(subtrees + subtree_id, min_size, max_size);
+
+    if(nb_workers > max_size)
+    {
+	/* We withdraw the number of workers just assigned from the total number of workers */
+	nb_workers -= assign_multiple_trees(subtrees, obj->arity, min_size, max_size);
 
-	if (cpu_children_cnt == obj->arity)
-	for (child = 0; child < worker_cnt_rec; child++)
+	/* Some workers are not assigned yet : we gather them in the array
+	 * which is returned to the father which will handle them later */
+	if(nb_workers)
+	    gather_trees(tree, subtrees, obj->arity);
+    }
+    else if(nb_workers < max_size)
+    {
+	gather_trees(tree, subtrees, obj->arity);
+    }
+    else // nb_workers == max_size
+    {
+	gather_trees(tree, subtrees, obj->arity);
+
+	int ret = starpu_combined_worker_assign_workerid(nb_workers, workers);
+	STARPU_ASSERT(ret >= 0);
+	nb_workers = 0;
+    }
+
+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
+	free(subtrees[subtree_id].workers);
+    free(subtrees);
+
+    tree->nb_workers = nb_workers;
+    return nb_workers;
+}
+
+/* get_min_max_sizes
+ * =================================================
+ * Purpose
+ * =======
+ * First, try to get the value from the STARPU_MIN_WORKERSIZE and STARPU_MAX_WORKERSIZE
+ * environment variables.
+ * If both of them were not set, then we try do get some efficient values following the rule beneath :
+ *
+ * 				-->   exact 	-->  MIN_SIZE = S-1 <--> MAX_SIZE = S+1
+ * S = square_root(nb_cpus)
+ *				-->   decimal 	-->  MIN_SIZE = truncation(S) <--> MAX_SIZE = rounding_up(S)
+ *
+ * If only one of both was not set then we set it with a value relative to the other, for example :
+ *
+ *		 	MIN_SIZE = MAX_SIZE - 1 or MAX_SIZE = MIN_SIZE + 1
+ *
+ * Arguments
+ * =========
+ * min_size		(output)
+ *			Pointer to the minimum size of a combined worker, whether set with
+ *			value given by the user or processed from the number of cpus.
+ *
+ * max_size		(output)
+ *			Pointer to the maximum size of a combined worker, whether set with
+ *			value given by the user or processed from the number of cpus.
+ *
+ * topology		(input)
+ *			Topology of the machine : used to know the number of cpus.
+ */
+
+static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machine_topology_s *topology)
+{
+    int _min_size, _max_size;
+
+    _min_size = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
+    _max_size = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
+
+    /* If the user didn't set both the environment variables,
+     * we need to find a minimum and a maximum size ourselves */
+    if(_min_size <= -1 || _max_size <= -1)
+    {
+
+	int nb_cpus = topology->nhwcpus;
+	int sqrt_nb_cpus = sqrt(nb_cpus);
+	short exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
+
+	    if(_min_size == -1)
+	    {
+		if(_max_size > -1)
+		    _min_size = _max_size - 1;
+		else
+		    _min_size = exact ? sqrt_nb_cpus - 1 : sqrt_nb_cpus;
+	    }
+
+	if(_max_size == -1)
 	{
-		worker_array[*worker_cnt] = worker_array_rec[child];
-		*worker_cnt = *worker_cnt + 1;
+	    if(_min_size > -1)
+		_max_size = _min_size + 1;
+	    else
+		_max_size = sqrt_nb_cpus + 1;
 	}
-	
-	/* If there is at least 2 children that are valid, we combined them. */
-	int maxsize = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
-	int minsize = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
+    }
 
-	if (cpu_children_cnt > 1 && worker_cnt_rec > 0 && worker_cnt_rec <= maxsize && worker_cnt_rec >= minsize)
-		starpu_combined_worker_assign_workerid(worker_cnt_rec, worker_array_rec);
+    *min_size = _min_size;
+    *max_size = _max_size;
 
-	return (cpu_children_cnt == obj->arity);
+    return;
 }
 
-static void find_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
+/* find_and_assign_combinations_with_hwloc
+ * =======================================
+ * Purpose
+ * =======
+ * Launches find_and_assign_combinations_with_hwloc_recursive function on the root
+ * of the hwloc tree to gather and assign combined cpu workers in an efficient manner.
+ * When find_and_assign_combinations_with_hwloc_recursive returns, if there are still
+ * some workers, we assign them no matter the number for there is no way to respect
+ * the wanted sizes anymore.
+ *
+ * Arguments
+ * =========
+ * topology		(input)
+ *			Topology of the machine : used to know the number of cpus and
+ *			to get the hwloc tree.
+ */
+
+static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
 {
-	/* We don't care about the result */
-	int worker_array[STARPU_NMAXWORKERS];
-	int worker_cnt = 0;
-
-	/* We recursively go from the root to the leaves of the tree to find
-	 * subtrees that only have CPUs as leaves. */
-	hwloc_obj_t root;
-	root = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
-	find_combinations_with_hwloc_rec(root, worker_array, &worker_cnt);
+    unsigned nb_workers;
+    int min_size, max_size;
+
+    get_min_max_sizes(&min_size, &max_size, topology);
+
+    STARPU_ASSERT(min_size <= max_size);
+
+    tree_t tree;
+
+    /* Of course we start from the root */
+    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
+    tree.nb_workers = 0;
+    tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
+
+    /* We recursively go from the root to the leaves of the tree to find
+     * subtrees that only have CPUs as leaves. */
+    nb_workers = find_and_assign_combinations_with_hwloc_recursive(&tree, min_size, max_size);
+
+    /* There are still some workers left, since the only possibility is that
+     * the number of workers left is less than the minimum worker size we assign them all */
+    if(nb_workers > 0)
+    {
+	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
+	 * while there are enough workers to assign regarding the min_size value */
+	STARPU_ASSERT(nb_workers < max_size);
+
+	int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
+	STARPU_ASSERT(ret >= 0);
+    }
+
+    free(tree.workers);
 }
 
-#else
+#else /* STARPU_HAVE_HWLOC */
 
-static void find_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
+static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
 
-	/* We put the id of all CPU workers in this array */
-	int cpu_workers[STARPU_NMAXWORKERS];
-	unsigned ncpus = 0;
+    /* We put the id of all CPU workers in this array */
+    int cpu_workers[STARPU_NMAXWORKERS];
+    unsigned ncpus = 0;
 
-	unsigned i;
-	for (i = 0; i < topology->nworkers; i++)
-	{
-		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
-			cpu_workers[ncpus++] = i;
-	}
-	
-	unsigned size;
-	for (size = 2; size <= ncpus; size *= 2)
+    unsigned i;
+    for (i = 0; i < topology->nworkers; i++)
+    {
+	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
+	    cpu_workers[ncpus++] = i;
+    }
+
+    unsigned size;
+    for (size = 2; size <= ncpus; size *= 2)
+    {
+	unsigned first_cpu;
+	for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
 	{
-		unsigned first_cpu;
-		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
-		{
-			if (first_cpu + size <= ncpus)
-			{
-				int workerids[size];
-
-				for (i = 0; i < size; i++)
-					workerids[i] = cpu_workers[first_cpu + i];
-
-				/* We register this combination */
-				int ret;
-				ret = starpu_combined_worker_assign_workerid(size, workerids); 
-				STARPU_ASSERT(ret >= 0);
-			}
-		}
+	    if (first_cpu + size <= ncpus)
+	    {
+		int workerids[size];
+
+		for (i = 0; i < size; i++)
+		    workerids[i] = cpu_workers[first_cpu + i];
+
+		/* We register this combination */
+		int ret;
+		ret = starpu_combined_worker_assign_workerid(size, workerids); 
+		STARPU_ASSERT(ret >= 0);
+	    }
 	}
+    }
 }
-#endif
+
+#endif /* STARPU_HAVE_HWLOC */
 
 static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
 
-	int cpu_workers[STARPU_NMAXWORKERS];
-	unsigned ncpus = 0;
+    int cpu_workers[STARPU_NMAXWORKERS];
+    unsigned ncpus = 0;
 
-	unsigned i;
-	for (i = 0; i < topology->nworkers; i++)
-	{
-		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
-			cpu_workers[ncpus++] = i;
-	}
+    unsigned i;
+    for (i = 0; i < topology->nworkers; i++)
+    {
+	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
+	    cpu_workers[ncpus++] = i;
+    }
 
-	if (ncpus > 0)
-	{
-		int ret;
-		ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
-		STARPU_ASSERT(ret >= 0);
-	}
+    if (ncpus > 0)
+    {
+	int ret;
+	ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
+	STARPU_ASSERT(ret >= 0);
+    }
 }
 
 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
@@ -175,9 +479,9 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 	combine_all_cpu_workers(topology);
     else {
 #ifdef STARPU_HAVE_HWLOC
-	find_combinations_with_hwloc(topology);
+	find_and_assign_combinations_with_hwloc(topology);
 #else
-	find_combinations_without_hwloc(topology);
+	find_and_assign_combinations_without_hwloc(topology);
 #endif
     }
 }

+ 1 - 0
tests/Makefile.am

@@ -121,6 +121,7 @@ noinst_PROGRAMS =				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
+	datawizard/copy				\
 	datawizard/data_implicit_deps		\
 	datawizard/data_lookup			\
 	datawizard/scratch			\

+ 102 - 0
tests/datawizard/copy.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+static unsigned nloops = 1000;
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+}
+
+static starpu_codelet cpu_codelet =
+{
+        .where = STARPU_CPU,
+        .cpu_func = dummy_func,
+        .model = NULL,
+        .nbuffers = 1
+};
+
+static starpu_codelet gpu_codelet =
+{
+        .where = STARPU_CUDA|STARPU_OPENCL,
+        .cuda_func = dummy_func,
+        .opencl_func = dummy_func,
+        .model = NULL,
+        .nbuffers = 1
+};
+
+
+int main(int argc, char **argv)
+{
+        float foo;
+	starpu_data_handle float_array_handle;
+        int i;
+
+        starpu_init(NULL);
+
+	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0)
+	{
+		FPRINTF(stderr, "This application requires a CUDA or OpenCL Worker\n");
+		starpu_shutdown();
+		return 77;
+	}
+
+        foo = 0.0f;
+	starpu_variable_data_register(&float_array_handle, 0, (uintptr_t)&foo, sizeof(foo));
+
+        for (i = 0; i < nloops; i++)
+        {
+		struct starpu_task *task_cpu, *task_gpu;
+                int ret;
+
+		task_cpu = starpu_task_create();
+		task_gpu = starpu_task_create();
+
+		task_cpu->cl = &cpu_codelet;
+		task_cpu->callback_func = NULL;
+		task_cpu->buffers[0].handle = float_array_handle;
+		task_cpu->buffers[0].mode = STARPU_RW;
+
+		task_gpu->cl = &gpu_codelet;
+		task_gpu->callback_func = NULL;
+		task_gpu->buffers[0].handle = float_array_handle;
+		task_gpu->buffers[0].mode = STARPU_RW;
+
+		ret = starpu_task_submit(task_cpu);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+
+		ret = starpu_task_submit(task_gpu);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+        }
+
+	starpu_task_wait_for_all();
+	starpu_data_unregister(float_array_handle);
+        starpu_shutdown();
+
+        return 0;
+}

+ 8 - 0
tests/datawizard/dining_philosophers.c

@@ -23,6 +23,8 @@
 starpu_data_handle fork_handles[N];
 unsigned forks[N];
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static void eat_kernel(void *descr[], void *arg)
 {
 }
@@ -78,6 +80,12 @@ int main(int argc, char **argv)
 
 	starpu_task_wait_for_all();
 
+	FPRINTF(stderr, "waiting done\n");
+	for (f = 0; f < N; f++)
+	{
+		starpu_data_unregister(fork_handles[f]);
+	}
+
 	starpu_shutdown();
 
 	return 0;