Browse Source

merge with trunk revision 4143

Andra Hugo 13 years ago
parent
commit
482baecdf7
40 changed files with 1795 additions and 221 deletions
  1. 3 1
      Makefile.am
  2. 12 18
      configure.ac
  3. 42 41
      doc/starpu.texi
  4. 67 0
      examples/Makefile.am
  5. 1 1
      examples/basic_examples/block.c
  6. 214 0
      examples/lu/blas_complex.c
  7. 156 0
      examples/lu/blas_complex.h
  8. 19 0
      examples/lu/clu.c
  9. 19 0
      examples/lu/clu_implicit.c
  10. 19 0
      examples/lu/clu_implicit_pivot.c
  11. 19 0
      examples/lu/clu_kernels.c
  12. 19 0
      examples/lu/clu_pivot.c
  13. 52 0
      examples/lu/complex_double.h
  14. 52 0
      examples/lu/complex_float.h
  15. 1 0
      examples/lu/double.h
  16. 1 0
      examples/lu/float.h
  17. 13 0
      examples/lu/lu_example.c
  18. 19 0
      examples/lu/lu_example_complex_double.c
  19. 19 0
      examples/lu/lu_example_complex_float.c
  20. 28 18
      examples/lu/xlu_kernels.c
  21. 19 0
      examples/lu/zlu.c
  22. 19 0
      examples/lu/zlu_implicit.c
  23. 19 0
      examples/lu/zlu_implicit_pivot.c
  24. 19 0
      examples/lu/zlu_kernels.c
  25. 19 0
      examples/lu/zlu_pivot.c
  26. 19 1
      gcc-plugin/examples/Makefile.am
  27. 127 0
      gcc-plugin/examples/stencil5.c
  28. 6 0
      gcc-plugin/src/starpu.c
  29. 9 4
      gcc-plugin/tests/run-test.in
  30. 3 0
      gcc-plugin/tests/task-errors.c
  31. 5 7
      include/starpu_scheduler.h
  32. 7 7
      src/core/perfmodel/perfmodel_bus.c
  33. 219 14
      src/core/perfmodel/perfmodel_history.c
  34. 1 1
      src/core/workers.c
  35. 5 1
      src/drivers/opencl/driver_opencl.c
  36. 3 1
      src/sched_policies/deque_modeling_policy_data_aware.c
  37. 410 106
      src/sched_policies/detect_combined_workers.c
  38. 1 0
      tests/Makefile.am
  39. 102 0
      tests/datawizard/copy.c
  40. 8 0
      tests/datawizard/dining_philosophers.c

+ 3 - 1
Makefile.am

@@ -65,13 +65,15 @@ if BUILD_STARPU_TOP
 all-local:
 all-local:
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 	cd starpu-top ; $(QMAKE) ; $(MAKE)
 clean-local:
 clean-local:
-	cd starpu-top ; $(MAKE) clean
+	cd starpu-top ; $(QMAKE) ; $(MAKE) clean ; $(RM) Makefile
 # TODO: resources
 # TODO: resources
 install-exec-local:
 install-exec-local:
 	$(MKDIR_P) $(DESTDIR)$(bindir)
 	$(MKDIR_P) $(DESTDIR)$(bindir)
 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
 	$(INSTALL_STRIP_PROGRAM) starpu-top/StarPU-Top $(DESTDIR)$(bindir)
 uninstall-local:
 uninstall-local:
 	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
 	$(RM) $(DESTDIR)$(bindir)/StarPU-Top
+	$(RM) starpu-top/StarPU-Top
+	$(RM) starpu-top/Makefile
 endif
 endif
 
 
 if STARPU_HAVE_WINDOWS
 if STARPU_HAVE_WINDOWS

+ 12 - 18
configure.ac

@@ -594,9 +594,11 @@ if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
     	STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
     	STARPU_CHECK_OPENCL($opencl_dir, $opencl_include_dir, $opencl_lib_dir)
         if test "$have_valid_opencl" = "no" ; then
         if test "$have_valid_opencl" = "no" ; then
             for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH" ; do
             for f in "/usr/local/cuda" "/c/cuda" "/cygdrive/c/cuda" "/opt/cuda" "$CUDA_INC_PATH" "$CUDA_INSTALL_PATH" ; do
-    	        STARPU_CHECK_OPENCL($f, "no", "no")
-                if test "$have_valid_opencl" = "yes" ; then
-                    break
+                if test -n $f ; then
+    	            STARPU_CHECK_OPENCL($f, "no", "no")
+                    if test "$have_valid_opencl" = "yes" ; then
+                        break
+                    fi
                 fi
                 fi
             done
             done
         fi
         fi
@@ -1389,24 +1391,11 @@ AM_CONDITIONAL([COND_OPT], [test "$want_optional_tests" = yes])
 # File configuration
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
   chmod +x tests/regression/regression.sh
+  chmod +x gcc-plugin/tests/run-test
 ])
 ])
 
 
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
 AC_CONFIG_FILES(tests/regression/regression.sh tests/regression/profiles tests/regression/profiles.build.only)
-AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h)
-
-if test $build_gcc_plugin == "yes" ; then
-    AC_CONFIG_HEADERS([gcc-plugin/src/starpu-gcc-config.h])
-    AC_OUTPUT([
-	    gcc-plugin/Makefile
-	    gcc-plugin/src/Makefile
-	    gcc-plugin/tests/Makefile
-	    gcc-plugin/tests/run-test
-	    gcc-plugin/examples/Makefile
-            ])
-    AC_CONFIG_COMMANDS([executable-plugin-scripts], [
-            chmod +x gcc-plugin/tests/run-test
-            ])
-fi 
+AC_CONFIG_HEADER(src/common/config.h include/starpu_config.h gcc-plugin/src/starpu-gcc-config.h)
 
 
 AC_OUTPUT([
 AC_OUTPUT([
 	Makefile
 	Makefile
@@ -1425,6 +1414,11 @@ AC_OUTPUT([
 	doc/Makefile
 	doc/Makefile
 	mpi/Makefile
 	mpi/Makefile
 	starpu-top/StarPU-Top.pro
 	starpu-top/StarPU-Top.pro
+        gcc-plugin/Makefile
+	gcc-plugin/src/Makefile
+	gcc-plugin/tests/Makefile
+	gcc-plugin/tests/run-test
+	gcc-plugin/examples/Makefile
 ])
 ])
 
 
 AC_MSG_NOTICE([
 AC_MSG_NOTICE([

+ 42 - 41
doc/starpu.texi

@@ -409,7 +409,7 @@ configuration:
 * Setting flags for compiling and linking applications::  
 * Setting flags for compiling and linking applications::  
 * Running a basic StarPU application::  
 * Running a basic StarPU application::  
 * Kernel threads started by StarPU::
 * Kernel threads started by StarPU::
-* Using accelerators::          
+* Enabling OpenCL::
 @end menu
 @end menu
 
 
 @node Setting flags for compiling and linking applications
 @node Setting flags for compiling and linking applications
@@ -462,14 +462,20 @@ installed. This step is done only once per user and per machine.
 @node Kernel threads started by StarPU
 @node Kernel threads started by StarPU
 @section Kernel threads started by StarPU
 @section Kernel threads started by StarPU
 
 
-TODO: StarPU starts one thread per CPU core and binds them there, uses one of
-them per GPU. The application is not supposed to do computations in its own
-threads. TODO: add a StarPU function to bind an application thread (e.g. the
-main thread) to a dedicated core (and thus disable the corresponding StarPU CPU
-worker).
+StarPU automatically binds one thread per CPU core. It does not use
+SMT/hyperthreading because kernels are usually already optimized for using a
+full core, and using hyperthreading would make kernel calibration rather random.
 
 
-@node Using accelerators
-@section Using accelerators
+Since driving GPUs is a CPU-consuming task, StarPU dedicates one core per GPU
+
+While StarPU tasks are executing, the application is not supposed to do
+computations in the threads it starts itself, tasks should be used instead.
+
+TODO: add a StarPU function to bind an application thread (e.g. the main thread)
+to a dedicated core (and thus disable the corresponding StarPU CPU worker).
+
+@node Enabling OpenCL
+@section Enabling OpenCL
 
 
 When both CUDA and OpenCL drivers are enabled, StarPU will launch an
 When both CUDA and OpenCL drivers are enabled, StarPU will launch an
 OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
 OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
@@ -477,8 +483,26 @@ This design choice was necessary as OpenCL and CUDA can not run at the
 same time on the same NVIDIA GPU, as there is currently no interoperability
 same time on the same NVIDIA GPU, as there is currently no interoperability
 between them.
 between them.
 
 
-Details on how to specify devices running OpenCL and the ones running
-CUDA are given in @ref{Enabling OpenCL}.
+To enable OpenCL, you need either to disable CUDA when configuring StarPU:
+
+@example
+% ./configure --disable-cuda
+@end example
+
+or when running applications:
+
+@example
+% STARPU_NCUDA=0 ./application
+@end example
+
+OpenCL will automatically be started on any device not yet used by
+CUDA. So on a machine running 4 GPUS, it is therefore possible to
+enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
+so:
+
+@example
+% STARPU_NCUDA=2 ./application
+@end example
 
 
 
 
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------
@@ -1110,7 +1134,7 @@ or for example, by disabling CPU devices:
 @end smallexample
 @end smallexample
 
 
 or by disabling CUDA devices (which may permit to enable the use of OpenCL,
 or by disabling CUDA devices (which may permit to enable the use of OpenCL,
-see @ref{Using accelerators}):
+see @ref{Enabling OpenCL}):
 
 
 @smallexample
 @smallexample
 % STARPU_NCUDA=0 ./vector_scal
 % STARPU_NCUDA=0 ./vector_scal
@@ -2153,7 +2177,9 @@ very natural way by the means of asynchronous interactions between the
 application and StarPU.  This is implemented in a separate libstarpumpi library
 application and StarPU.  This is implemented in a separate libstarpumpi library
 which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
 which basically provides "StarPU" equivalents of @code{MPI_*} functions, where
 @code{void *} buffers are replaced with @code{starpu_data_handle}s, and all
 @code{void *} buffers are replaced with @code{starpu_data_handle}s, and all
-GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.
+GPU-RAM-NIC transfers are handled efficiently by StarPU-MPI.  The user has to
+use the usual @code{mpirun} command of the MPI implementation to start StarPU on
+the different MPI nodes.
 
 
 @menu
 @menu
 * The API::                     
 * The API::                     
@@ -3909,11 +3935,13 @@ This partitions a vector into blocks of the same size.
 
 
 
 
 @deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 @deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a vector into blocks of sizes given in @var{filter_arg_ptr}.
+This partitions a vector into blocks of sizes given in the @var{filter_arg_ptr}
+field of @var{f}, supposed to point on a @code{uint32_t*} array.
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 @deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a vector into two blocks, the first block size being given in @var{filter_arg}.
+This partitions a vector into two blocks, the first block size being given in
+the @var{filter_arg} field of @var{f}.
 @end deftypefun
 @end deftypefun
 
 
 
 
@@ -4601,38 +4629,11 @@ This function synchronously deinitializes the CUBLAS library on every CUDA devic
 @section OpenCL extensions
 @section OpenCL extensions
 
 
 @menu
 @menu
-* Enabling OpenCL::            Enabling OpenCL
 * Compiling OpenCL kernels::   Compiling OpenCL kernels
 * Compiling OpenCL kernels::   Compiling OpenCL kernels
 * Loading OpenCL kernels::     Loading OpenCL kernels
 * Loading OpenCL kernels::     Loading OpenCL kernels
 * OpenCL statistics::          Collecting statistics from OpenCL
 * OpenCL statistics::          Collecting statistics from OpenCL
 @end menu
 @end menu
 
 
-@node Enabling OpenCL
-@subsection Enabling OpenCL
-
-On GPU devices which can run both CUDA and OpenCL, CUDA will be
-enabled by default. To enable OpenCL, you need either to disable CUDA
-when configuring StarPU:
-
-@example
-% ./configure --disable-cuda
-@end example
-
-or when running applications:
-
-@example
-% STARPU_NCUDA=0 ./application
-@end example
-
-OpenCL will automatically be started on any device not yet used by
-CUDA. So on a machine running 4 GPUS, it is therefore possible to
-enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
-so:
-
-@example
-% STARPU_NCUDA=2 ./application
-@end example
-
 @node Compiling OpenCL kernels
 @node Compiling OpenCL kernels
 @subsection Compiling OpenCL kernels
 @subsection Compiling OpenCL kernels
 
 

+ 67 - 0
examples/Makefile.am

@@ -112,6 +112,9 @@ noinst_HEADERS = 				\
 	lu/xlu_kernels.h			\
 	lu/xlu_kernels.h			\
 	lu/float.h				\
 	lu/float.h				\
 	lu/double.h				\
 	lu/double.h				\
+	lu/complex_float.h			\
+	lu/complex_double.h			\
+	lu/blas_complex.h			\
 	cholesky/cholesky.h			\
 	cholesky/cholesky.h			\
 	common/blas_model.h			\
 	common/blas_model.h			\
 	common/blas.h				\
 	common/blas.h				\
@@ -195,6 +198,14 @@ examplebin_PROGRAMS +=				\
 	cg/cg
 	cg/cg
 endif
 endif
 
 
+if MKL_BLAS_LIB
+examplebin_PROGRAMS +=				\
+	lu/lu_example_complex_float		\
+	lu/lu_example_complex_double		\
+	lu/lu_implicit_example_complex_float	\
+	lu/lu_implicit_example_complex_double
+endif
+
 if ATLAS_BLAS_LIB
 if ATLAS_BLAS_LIB
 examplebin_PROGRAMS +=				\
 examplebin_PROGRAMS +=				\
 	spmv/dw_block_spmv
 	spmv/dw_block_spmv
@@ -244,6 +255,14 @@ STARPU_EXAMPLES +=				\
 	cg/cg
 	cg/cg
 endif
 endif
 
 
+if MKL_BLAS_LIB
+STARPU_EXAMPLES +=				\
+	lu/lu_example_complex_float		\
+	lu/lu_example_complex_double		\
+	lu/lu_implicit_example_complex_float	\
+	lu/lu_implicit_example_complex_double
+endif
+
 if ATLAS_BLAS_LIB
 if ATLAS_BLAS_LIB
 STARPU_EXAMPLES +=				\
 STARPU_EXAMPLES +=				\
 	spmv/dw_block_spmv
 	spmv/dw_block_spmv
@@ -466,8 +485,56 @@ lu_lu_implicit_example_double_SOURCES =		\
 
 
 lu_lu_implicit_example_double_LDADD =		\
 lu_lu_implicit_example_double_LDADD =		\
 	$(STARPU_BLAS_LDFLAGS)
 	$(STARPU_BLAS_LDFLAGS)
+
+if MKL_BLAS_LIB
+lu_lu_example_complex_float_SOURCES =		\
+	lu/lu_example_complex_float.c		\
+	lu/clu.c				\
+	lu/clu_pivot.c				\
+	lu/clu_kernels.c			\
+	lu/blas_complex.c			\
+	common/blas.c
+
+lu_lu_example_complex_float_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
+lu_lu_implicit_example_complex_float_SOURCES =	\
+	lu/lu_example_complex_float.c		\
+	lu/clu_implicit.c			\
+	lu/clu_implicit_pivot.c			\
+	lu/clu_kernels.c			\
+	lu/blas_complex.c			\
+	common/blas.c
+
+lu_lu_implicit_example_complex_float_LDADD =	\
+	$(STARPU_BLAS_LDFLAGS)
+
+lu_lu_example_complex_double_SOURCES =		\
+	lu/lu_example_complex_double.c		\
+	lu/zlu.c				\
+	lu/zlu_pivot.c				\
+	lu/zlu_kernels.c			\
+	lu/blas_complex.c			\
+	common/blas.c
+
+lu_lu_example_complex_double_LDADD =		\
+	$(STARPU_BLAS_LDFLAGS)
+
+lu_lu_implicit_example_complex_double_SOURCES =	\
+	lu/lu_example_complex_double.c		\
+	lu/zlu_implicit.c			\
+	lu/zlu_implicit_pivot.c			\
+	lu/zlu_kernels.c			\
+	lu/blas_complex.c			\
+	common/blas.c
+
+lu_lu_implicit_example_complex_double_LDADD =	\
+	$(STARPU_BLAS_LDFLAGS)
+
+endif
 endif
 endif
 
 
+
 ###########################
 ###########################
 # 2 Cholesky in 2 ctxs  example #
 # 2 Cholesky in 2 ctxs  example #
 ###########################
 ###########################

+ 1 - 1
examples/basic_examples/block.c

@@ -122,5 +122,5 @@ int main(int argc, char **argv)
 
 
         starpu_shutdown();
         starpu_shutdown();
 
 
-	return 0;
+	return (ret!=1);
 }
 }

+ 214 - 0
examples/lu/blas_complex.c

@@ -0,0 +1,214 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <complex.h>
+
+#include <starpu.h>
+#include "blas_complex.h"
+
+/*
+    This files contains BLAS wrappers for the different BLAS implementations
+  (eg. REFBLAS, STARPU_ATLAS, GOTOBLAS ...). We assume a Fortran orientation as most
+  libraries do not supply C-based ordering.
+ */
+
+#ifdef STARPU_ATLAS
+#error not implemented
+#elif defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
+#error not implemented
+#elif defined(STARPU_MKL)
+
+inline void CGEMM(char *transa, char *transb, int M, int N, int K, 
+			complex float alpha, complex float *A, int lda, complex float *B, int ldb, 
+			complex float beta, complex float *C, int ldc)
+{
+	cgemm_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+inline void ZGEMM(char *transa, char *transb, int M, int N, int K, 
+			complex double alpha, complex double *A, int lda, complex double *B, int ldb, 
+			complex double beta, complex double *C, int ldc)
+{
+	zgemm_(transa, transb, &M, &N, &K, &alpha,
+			 A, &lda, B, &ldb,
+			 &beta, C, &ldc);	
+}
+
+
+inline void CGEMV(char *transa, int M, int N, complex float alpha, complex float *A, int lda,
+		complex float *X, int incX, complex float beta, complex float *Y, int incY)
+{
+	cgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline void ZGEMV(char *transa, int M, int N, complex double alpha, complex double *A, int lda,
+		complex double *X, int incX, complex double beta, complex double *Y, int incY)
+{
+	zgemv_(transa, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+}
+
+inline float SCASUM(int N, complex float *X, int incX)
+{
+	return scasum_(&N, X, &incX);
+}
+
+inline double DZASUM(int N, complex double *X, int incX)
+{
+	return dzasum_(&N, X, &incX);
+}
+
+void CSCAL(int N, complex float alpha, complex float *X, int incX)
+{
+	cscal_(&N, &alpha, X, &incX);
+}
+
+void ZSCAL(int N, complex double alpha, complex double *X, int incX)
+{
+	zscal_(&N, &alpha, X, &incX);
+}
+
+void CTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const complex float alpha, const complex float *A, const int lda,
+                   complex float *B, const int ldb)
+{
+	ctrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void ZTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const complex double alpha, const complex double *A, const int lda,
+                   complex double *B, const int ldb)
+{
+	ztrsm_(side, uplo, transa, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void CSYR (const char *uplo, const int n, const complex float alpha,
+                  const complex float *x, const int incx, complex float *A, const int lda)
+{
+	csyr_(uplo, &n, &alpha, x, &incx, A, &lda); 
+}
+
+void CSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const complex float alpha, const complex float *A,
+                   const int lda, const complex float beta, complex float *C,
+                   const int ldc)
+{
+	csyrk_(uplo, trans, &n, &k, &alpha, A, &lda, &beta, C, &ldc); 
+}
+
+void CGERU(const int m, const int n, const complex float alpha,
+                  const complex float *x, const int incx, const complex float *y,
+                  const int incy, complex float *A, const int lda)
+{
+	cgeru_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void ZGERU(const int m, const int n, const complex double alpha,
+                  const complex double *x, const int incx, const complex double *y,
+                  const int incy, complex double *A, const int lda)
+{
+	zgeru_(&m, &n, &alpha, x, &incx, y, &incy, A, &lda);
+}
+
+void CTRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const complex float *A, const int lda, complex float *x, 
+                   const int incx)
+{
+	ctrsv_(uplo, trans, diag, &n, A, &lda, x, &incx);
+}
+
+void CTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const complex float alpha, const complex float *A, const int lda,
+                 complex float *B, const int ldb)
+{
+	ctrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void ZTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const complex double alpha, const complex double *A, const int lda,
+                 complex double *B, const int ldb)
+{
+	ztrmm_(side, uplo, transA, diag, &m, &n, &alpha, A, &lda, B, &ldb);
+}
+
+void CTRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const complex float *A, const int lda, complex float *X,
+                 const int incX)
+{
+	ctrmv_(uplo, transA, diag, &n, A, &lda, X, &incX);
+}
+
+void CAXPY(const int n, const complex float alpha, complex float *X, const int incX, complex float *Y, const int incY)
+{
+	caxpy_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+void ZAXPY(const int n, const complex double alpha, complex double *X, const int incX, complex double *Y, const int incY)
+{
+	zaxpy_(&n, &alpha, X, &incX, Y, &incY);
+}
+
+int ICAMAX (const int n, complex float *X, const int incX)
+{
+    int retVal;
+    retVal = icamax_ (&n, X, &incX);
+    return retVal;
+}
+
+int IZAMAX (const int n, complex double *X, const int incX)
+{
+    int retVal;
+    retVal = izamax_ (&n, X, &incX);
+    return retVal;
+}
+
+complex float CDOTU(const int n, const complex float *x, const int incx, const complex float *y, const int incy)
+{
+	complex float retVal = 0;
+
+	/* GOTOBLAS will return a FLOATRET which is a double, not a float */
+	retVal = (float)cdotu_(&n, x, &incx, y, &incy);
+
+	return retVal;
+}
+
+complex double ZDOTU(const int n, const complex double *x, const int incx, const complex double *y, const int incy)
+{
+	return zdotu_(&n, x, &incx, y, &incy);
+}
+
+void CSWAP(const int n, complex float *X, const int incX, complex float *Y, const int incY)
+{
+	cswap_(&n, X, &incX, Y, &incY);
+}
+
+void ZSWAP(const int n, complex double *X, const int incX, complex double *Y, const int incY)
+{
+	zswap_(&n, X, &incX, Y, &incY);
+}
+
+
+#else
+#error "no BLAS lib available..."
+#endif

+ 156 - 0
examples/lu/blas_complex.h

@@ -0,0 +1,156 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __BLAS_H__
+#define __BLAS_H__
+
+#include <starpu.h>
+#if defined(STARPU_MKL)
+#define MKLcomplex8 complex float
+#define MKLcomplex16 complex double
+#endif
+
+void CGEMM(char *transa, char *transb, int M, int N, int K, complex float alpha, complex float *A, int lda, 
+		complex float *B, int ldb, complex float beta, complex float *C, int ldc);
+void ZGEMM(char *transa, char *transb, int M, int N, int K, complex double alpha, complex double *A, int lda, 
+		complex double *B, int ldb, complex double beta, complex double *C, int ldc);
+void CGEMV(char *transa, int M, int N, complex float alpha, complex float *A, int lda,
+		complex float *X, int incX, complex float beta, complex float *Y, int incY);
+void ZGEMV(char *transa, int M, int N, complex double alpha, complex double *A, int lda,
+		complex double *X, int incX, complex double beta, complex double *Y, int incY);
+float SCASUM(int N, complex float *X, int incX);
+double DZASUM(int N, complex double *X, int incX);
+void CSCAL(int N, complex float alpha, complex float *X, int incX);
+void ZSCAL(int N, complex double alpha, complex double *X, int incX);
+void CTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const complex float alpha, const complex float *A, const int lda,
+                   complex float *B, const int ldb);
+void ZTRSM (const char *side, const char *uplo, const char *transa,
+                   const char *diag, const int m, const int n,
+                   const complex double alpha, const complex double *A, const int lda,
+                   complex double *B, const int ldb);
+void CSYR (const char *uplo, const int n, const complex float alpha,
+                  const complex float *x, const int incx, complex float *A, const int lda);
+void CSYRK (const char *uplo, const char *trans, const int n,
+                   const int k, const complex float alpha, const complex float *A,
+                   const int lda, const complex float beta, complex float *C,
+                   const int ldc);
+void CGERU (const int m, const int n, const complex float alpha,
+                  const complex float *x, const int incx, const complex float *y,
+                  const int incy, complex float *A, const int lda);
+void ZGERU(const int m, const int n, const complex double alpha,
+                  const complex double *x, const int incx, const complex double *y,
+                  const int incy, complex double *A, const int lda);
+void CTRSV (const char *uplo, const char *trans, const char *diag, 
+                   const int n, const complex float *A, const int lda, complex float *x, 
+                   const int incx);
+void CTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const complex float alpha, const complex float *A, const int lda,
+                 complex float *B, const int ldb);
+void ZTRMM(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int m, const int n,
+                 const complex double alpha, const complex double *A, const int lda,
+                 complex double *B, const int ldb);
+void CTRMV(const char *uplo, const char *transA, const char *diag,
+                 const int n, const complex float *A, const int lda, complex float *X,
+                 const int incX);
+void CAXPY(const int n, const complex float alpha, complex float *X, const int incX, complex float *Y, const int incy);
+void ZAXPY(const int n, const complex double alpha, complex double *X, const int incX, complex double *Y, const int incY);
+int ICAMAX (const int n, complex float *X, const int incX);
+int IZAMAX (const int n, complex double *X, const int incX);
+complex float CDOTU(const int n, const complex float *x, const int incx, const complex float *y, const int incy);
+complex double ZDOTU(const int n, const complex double *x, const int incx, const complex double *y, const int incy);
+void CSWAP(const int n, complex float *x, const int incx, complex float *y, const int incy);
+void ZSWAP(const int n, complex double *x, const int incx, complex double *y, const int incy);
+
+#if defined(STARPU_GOTO) || defined(STARPU_SYSTEM_BLAS)
+#error not implemented
+#elif defined(STARPU_MKL)
+
+extern void cgemm_ (const char *transa, const char *transb, const int *m,
+                   const int *n, const int *k, const complex float *alpha, 
+                   const complex float *A, const int *lda, const complex float *B, 
+                   const int *ldb, const complex float *beta, complex float *C, 
+                   const int *ldc);
+extern void zgemm_ (const char *transa, const char *transb, const int *m,
+                   const int *n, const int *k, const complex double *alpha, 
+                   const complex double *A, const int *lda, const complex double *B, 
+                   const int *ldb, const complex double *beta, complex double *C, 
+                   const int *ldc);
+extern void cgemv_(const char *trans, int *m, int *n, complex float *alpha,
+                   void *a, int *lda, void *x, int *incx, 
+                   complex float *beta, void *y, int *incy);
+extern void zgemv_(const char *trans, int *m, int *n, complex double *alpha,
+                   void *a, int *lda, void *x, int *incx,
+                   complex double *beta, void *y, int *incy);
+extern void csyr_ (const char *uplo, const int *n, const complex float *alpha,
+                  const complex float *x, const int *incx, complex float *A, const int *lda);
+extern void csyrk_ (const char *uplo, const char *trans, const int *n,
+                   const int *k, const complex float *alpha, const complex float *A,
+                   const int *lda, const complex float *beta, complex float *C,
+                   const int *ldc);
+extern void ctrsm_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int *m, const int *n,
+                   const complex float *alpha, const complex float *A, const int *lda,
+                   complex float *B, const int *ldb);
+extern void ztrsm_ (const char *side, const char *uplo, const char *transa, 
+                   const char *diag, const int *m, const int *n,
+                   const complex double *alpha, const complex double *A, const int *lda,
+                   complex double *B, const int *ldb);
+extern complex double scasum_ (const int *n, const complex float *x, const int *incx);
+extern complex double dzasum_ (const int *n, const complex double *x, const int *incx);
+extern void cscal_ (const int *n, const complex float *alpha, complex float *x,
+                   const int *incx);
+extern void zscal_ (const int *n, const complex double *alpha, complex double *x,
+                   const int *incx);
+extern void cgeru_(const int *m, const int *n, const complex float *alpha,
+                  const complex float *x, const int *incx, const complex float *y,
+                  const int *incy, complex float *A, const int *lda);
+extern void zgeru_(const int *m, const int *n, const complex double *alpha,
+                  const complex double *x, const int *incx, const complex double *y,
+                  const int *incy, complex double *A, const int *lda);
+extern void ctrsv_ (const char *uplo, const char *trans, const char *diag, 
+                   const int *n, const complex float *A, const int *lda, complex float *x, 
+                   const int *incx);
+extern void ctrmm_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int *m, const int *n,
+                 const complex float *alpha, const complex float *A, const int *lda,
+                 complex float *B, const int *ldb);
+extern void ztrmm_(const char *side, const char *uplo, const char *transA,
+                 const char *diag, const int *m, const int *n,
+                 const complex double *alpha, const complex double *A, const int *lda,
+                 complex double *B, const int *ldb);
+extern void ctrmv_(const char *uplo, const char *transA, const char *diag,
+                 const int *n, const complex float *A, const int *lda, complex float *X,
+                 const int *incX);
+extern void caxpy_(const int *n, const complex float *alpha, complex float *X, const int *incX,
+		complex float *Y, const int *incy);
+extern void zaxpy_(const int *n, const complex double *alpha, complex double *X, const int *incX,
+		complex double *Y, const int *incy);
+extern int icamax_(const int *n, complex float *X, const int *incX);
+extern int izamax_(const int *n, complex double *X, const int *incX);
+/* for some reason, FLOATRET is not a float but a double in GOTOBLAS */
+extern complex double cdotu_(const int *n, const complex float *x, const int *incx, const complex float *y, const int *incy);
+extern complex double zdotu_(const int *n, const complex double *x, const int *incx, const complex double *y, const int *incy);
+extern void cswap_(const int *n, complex float *x, const int *incx, complex float *y, const int *incy);
+extern void zswap_(const int *n, complex double *x, const int *incx, complex double *y, const int *incy);
+
+#endif
+
+#endif /* __BLAS_COMPLEX_H__ */

+ 19 - 0
examples/lu/clu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu.c"

+ 19 - 0
examples/lu/clu_implicit.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu_implicit.c"

+ 19 - 0
examples/lu/clu_implicit_pivot.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu_implicit_pivot.c"

+ 19 - 0
examples/lu/clu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu_kernels.c"

+ 19 - 0
examples/lu/clu_pivot.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "xlu_pivot.c"

+ 52 - 0
examples/lu/complex_double.h

@@ -0,0 +1,52 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <complex.h>
+#include "blas_complex.h"
+#define TYPE complex double
+#define CUBLAS_TYPE cuDoubleComplex
+
+#define STARPU_LU(name)       starpu_zlu_##name
+#define COMPLEX_LU
+
+#ifdef STARPU_HAVE_MAGMA
+#include <magmablas.h>
+#define CUBLAS_GEMM	magmablas_zgemm
+#define CUBLAS_TRSM	magmablas_ztrsm
+#else
+#define CUBLAS_GEMM	cublasZgemm
+#define CUBLAS_TRSM	cublasZtrsm
+#endif
+
+#define CUBLAS_SCAL	cublasZscal
+#define CUBLAS_GER	cublasZgeru
+#define CUBLAS_SWAP	cublasZswap
+#define CUBLAS_IAMAX	cublasIzamax
+
+#define CPU_GEMM	ZGEMM
+#define CPU_TRSM	ZTRSM
+#define CPU_SCAL	ZSCAL
+#define CPU_GER		ZGERU
+#define CPU_SWAP	ZSWAP
+
+#define CPU_TRMM	ZTRMM
+#define CPU_AXPY	ZAXPY
+#define CPU_ASUM	DZASUM
+#define CPU_IAMAX	IZAMAX
+
+#define PIVOT_THRESHHOLD	10e-5

+ 52 - 0
examples/lu/complex_float.h

@@ -0,0 +1,52 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <complex.h>
+#include "blas_complex.h"
+#define TYPE complex float
+#define CUBLAS_TYPE cuComplex
+
+#define STARPU_LU(name)       starpu_clu_##name
+#define COMPLEX_LU
+
+#ifdef STARPU_HAVE_MAGMA
+#include <magmablas.h>
+#define CUBLAS_GEMM	magmablas_cgemm
+#define CUBLAS_TRSM	magmablas_ctrsm
+#else
+#define CUBLAS_GEMM	cublasCgemm
+#define CUBLAS_TRSM	cublasCtrsm
+#endif
+
+#define CUBLAS_SCAL	cublasCscal
+#define CUBLAS_GER	cublasCgeru
+#define CUBLAS_SWAP	cublasCswap
+#define CUBLAS_IAMAX	cublasIcamax
+
+#define CPU_GEMM	CGEMM
+#define CPU_TRSM	CTRSM
+#define CPU_SCAL	CSCAL
+#define CPU_GER		CGERU
+#define CPU_SWAP	CSWAP
+
+#define CPU_TRMM	CTRMM
+#define CPU_AXPY	CAXPY
+#define CPU_ASUM	SCASUM
+#define CPU_IAMAX	ICAMAX
+
+#define PIVOT_THRESHHOLD	10e-5

+ 1 - 0
examples/lu/double.h

@@ -16,6 +16,7 @@
  */
  */
 
 
 #define TYPE double
 #define TYPE double
+#define CUBLAS_TYPE TYPE
 
 
 #define STARPU_LU(name)       starpu_dlu_##name
 #define STARPU_LU(name)       starpu_dlu_##name
 
 

+ 1 - 0
examples/lu/float.h

@@ -17,6 +17,7 @@
 
 
 
 
 #define TYPE float
 #define TYPE float
+#define CUBLAS_TYPE TYPE
 
 
 #define STARPU_LU(name)       starpu_slu_##name
 #define STARPU_LU(name)       starpu_slu_##name
 
 

+ 13 - 0
examples/lu/lu_example.c

@@ -164,6 +164,10 @@ static void init_matrix(void)
 		for (i = 0; i < size; i++)
 		for (i = 0; i < size; i++)
 		{
 		{
 			A[i + j*size] = (TYPE)starpu_drand48();
 			A[i + j*size] = (TYPE)starpu_drand48();
+#ifdef COMPLEX_LU
+			/* also randomize the imaginary component for complex number cases */
+			A[i + j*size] += (TYPE)(I*starpu_drand48());
+#endif
 		}
 		}
 	}
 	}
 
 
@@ -249,11 +253,20 @@ static void check_result(void)
 	CPU_AXPY(size*size, -1.0, A_saved, 1, L, 1);
 	CPU_AXPY(size*size, -1.0, A_saved, 1, L, 1);
 	display_matrix(L, size, size, "Residuals");
 	display_matrix(L, size, size, "Residuals");
 	
 	
+#ifdef COMPLEX_LU
+	double err = CPU_ASUM(size*size, L, 1);
+	int max = CPU_IAMAX(size*size, L, 1);
+	TYPE l_max = L[max];
+
+	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
+	FPRINTF(stderr, "Max error : %e\n", sqrt(creal(l_max)*creal(l_max)+cimag(l_max)*cimag(l_max)));
+#else
 	TYPE err = CPU_ASUM(size*size, L, 1);
 	TYPE err = CPU_ASUM(size*size, L, 1);
 	int max = CPU_IAMAX(size*size, L, 1);
 	int max = CPU_IAMAX(size*size, L, 1);
 
 
 	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
 	FPRINTF(stderr, "Avg error : %e\n", err/(size*size));
 	FPRINTF(stderr, "Max error : %e\n", L[max]);
 	FPRINTF(stderr, "Max error : %e\n", L[max]);
+#endif
 
 
 	double residual = frobenius_norm(L, size);
 	double residual = frobenius_norm(L, size);
 	double matnorm = frobenius_norm(A_saved, size);
 	double matnorm = frobenius_norm(A_saved, size);

+ 19 - 0
examples/lu/lu_example_complex_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "lu_example.c"

+ 19 - 0
examples/lu/lu_example_complex_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_float.h"
+#include "lu_example.c"

+ 28 - 18
examples/lu/xlu_kernels.c

@@ -22,6 +22,11 @@
 #define xstr(s)        str(s)
 #define xstr(s)        str(s)
 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
 #define STARPU_LU_STR(name)  xstr(STARPU_LU(name))
 
 
+#ifdef STARPU_USE_CUDA
+static const TYPE p1 =  1.0f;
+static const TYPE m1 = -1.0f;
+#endif
+
 /*
 /*
  *   U22 
  *   U22 
  */
  */
@@ -54,10 +59,10 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 			break;
 			break;
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-		case 1:
+		case 1: {
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
 			CUBLAS_GEMM('n', 'n', dx, dy, dz,
-				(TYPE)-1.0, right, ld21, left, ld12,
-				(TYPE)1.0f, center, ld22);
+				*(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
+				*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
 
 
 			status = cublasGetError();
 			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
@@ -67,6 +72,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 				STARPU_CUDA_REPORT_ERROR(cures);
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 
 			break;
 			break;
+		}
 #endif
 #endif
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
@@ -140,7 +146,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
 			CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
-					(TYPE)1.0, sub11, ld11, sub12, ld12);
+					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
 
 
 			status = cublasGetError();
 			status = cublasGetError();
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
@@ -221,7 +227,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		case 1:
 		case 1:
 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
 			CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
-					(TYPE)1.0, sub11, ld11, sub21, ld21);
+					*(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
 
 
 			status = cublasGetError();
 			status = cublasGetError();
 			if (status != CUBLAS_STATUS_SUCCESS)
 			if (status != CUBLAS_STATUS_SUCCESS)
@@ -307,17 +313,19 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				TYPE pivot;
 				TYPE pivot;
+				TYPE inv_pivot;
 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
 				cudaStreamSynchronize(0);
 				cudaStreamSynchronize(0);
 
 
 				STARPU_ASSERT(pivot != 0.0);
 				STARPU_ASSERT(pivot != 0.0);
 				
 				
-				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				inv_pivot = 1.0/pivot;
+				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
 				
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
-						&sub11[(z+1)+z*ld], 1,
-						&sub11[z+(z+1)*ld], ld,
-						&sub11[(z+1) + (z+1)*ld],ld);
+				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
+						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			}
 			
 			
 			cudaThreadSynchronize();
 			cudaThreadSynchronize();
@@ -423,20 +431,21 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			for (z = 0; z < nx; z++)
 			for (z = 0; z < nx; z++)
 			{
 			{
 				TYPE pivot;
 				TYPE pivot;
+				TYPE inv_pivot;
 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
 				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
 				cudaStreamSynchronize(0);
 				cudaStreamSynchronize(0);
 
 
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				{
 				{
 					/* find the pivot */
 					/* find the pivot */
-					int piv_ind = CUBLAS_IAMAX(nx - z, &sub11[z*(ld+1)], ld) - 1;
+					int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
 	
 	
 					ipiv[z + first] = piv_ind + z + first;
 					ipiv[z + first] = piv_ind + z + first;
 
 
 					/* swap if needed */
 					/* swap if needed */
 					if (piv_ind != 0)
 					if (piv_ind != 0)
 					{
 					{
-						CUBLAS_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
+						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
 					}
 					}
 
 
 					cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
 					cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
@@ -445,12 +454,13 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 
 
 				STARPU_ASSERT(pivot != 0.0);
 				STARPU_ASSERT(pivot != 0.0);
 				
 				
-				CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
+				inv_pivot = 1.0/pivot;
+				CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
 				
 				
-				CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
-						&sub11[(z+1)+z*ld], 1,
-						&sub11[z+(z+1)*ld], ld,
-						&sub11[(z+1) + (z+1)*ld],ld);
+				CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
+						(CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
+						(CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
+						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
 				
 				
 			}
 			}
 
 
@@ -534,7 +544,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 				unsigned rowpiv = ipiv[row+first] - first;
 				unsigned rowpiv = ipiv[row+first] - first;
 				if (rowpiv != row)
 				if (rowpiv != row)
 				{
 				{
-					CUBLAS_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
+					CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
 				}
 				}
 			}
 			}
 
 

+ 19 - 0
examples/lu/zlu.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu.c"

+ 19 - 0
examples/lu/zlu_implicit.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu_implicit.c"

+ 19 - 0
examples/lu/zlu_implicit_pivot.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu_implicit_pivot.c"

+ 19 - 0
examples/lu/zlu_kernels.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu_kernels.c"

+ 19 - 0
examples/lu/zlu_pivot.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "complex_double.h"
+#include "xlu_pivot.c"

+ 19 - 1
gcc-plugin/examples/Makefile.am

@@ -14,12 +14,30 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 
 noinst_PROGRAMS =				\
 noinst_PROGRAMS =				\
-  matrix-mult
+  matrix-mult stencil5
+
+if !NO_BLAS_LIB
+noinst_PROGRAMS +=				\
+  cholesky/cholesky
+endif
 
 
 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
 AM_LDFLAGS = $(top_builddir)/src/libstarpu.la
 
 
 AM_CPPFLAGS =						\
 AM_CPPFLAGS =						\
   -I$(top_srcdir)/include				\
   -I$(top_srcdir)/include				\
+  -I$(top_srcdir)/examples				\
   $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
   $(STARPU_OPENCL_CPPFLAGS) $(STARPU_CUDA_CPPFLAGS)
 
 
 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
 AM_CFLAGS = -fplugin="$(builddir)/../src/.libs/starpu.so" -Wall
+
+if !NO_BLAS_LIB
+cholesky_cholesky_SOURCES	=		\
+	cholesky/cholesky.c		\
+	cholesky/cholesky_models.c	\
+	cholesky/cholesky_kernels.c	\
+	$(top_srcdir)/examples/common/blas.c
+
+cholesky_cholesky_LDADD	=	\
+	$(STARPU_BLAS_LDFLAGS)
+endif
+

+ 127 - 0
gcc-plugin/examples/stencil5.c

@@ -0,0 +1,127 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <math.h>
+
+#ifndef STARPU_GCC_PLUGIN
+# error must be compiled with the StarPU GCC plug-in
+#endif
+
+/* Definition of the StarPU task and its CPU implementation.  */
+static void stencil5(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
+	__attribute__ ((task));
+
+static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
+	__attribute__ ((task_implementation ("cpu", stencil5)));
+
+static void stencil5_cpu(float *xy, const float *xm1y, const float *xp1y, const float *xym1, const float *xyp1)
+{
+	*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
+}
+
+#define NITER_DEF 20000
+#define X         10
+#define Y         10
+
+int display = 0;
+int niter = NITER_DEF;
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-iter") == 0) {
+			char *argptr;
+			niter = strtol(argv[++i], &argptr, 10);
+		}
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+	}
+}
+
+static float my_rand (void)
+{
+	return (float) rand () / (float) RAND_MAX;
+}
+
+int main(int argc, char **argv)
+{
+        int x, y;
+        float mean=0;
+        float matrix[X][Y];
+
+        parse_args(argc, argv);
+
+	srand (time (NULL));
+        for(x = 0; x < X; x++) {
+                for (y = 0; y < Y; y++) {
+                        matrix[x][y] = my_rand () * 100;
+                        mean += matrix[x][y];
+                }
+        }
+        mean /= (x*y);
+
+        if (display) {
+                fprintf(stdout, "mean=%f\n", mean);
+                for(x = 0; x < X; x++) {
+                        for (y = 0; y < Y; y++) {
+                                fprintf(stdout, "%3f ", matrix[x][y]);
+                        }
+                        fprintf(stdout, "\n");
+                }
+        }
+
+#pragma starpu initialize
+
+        for(x = 0; x < X; x++) {
+		for (y = 0; y < Y; y++) {
+#pragma starpu register &matrix[x][y] 1
+		}
+	}
+
+	while(niter--) {
+                for (x = 1; x < X-1; x++) {
+                        for (y = 1; y < Y-1; y++) {
+                                stencil5(&matrix[x][y], &matrix[x-1][y], &matrix[x+1][y],
+					 &matrix[x][y-1], &matrix[x][y+1]);
+                        }
+                }
+        }
+
+#pragma starpu wait
+
+        for(x = 0; x < X; x++) {
+                for (y = 0; y < Y; y++) {
+#pragma starpu unregister &matrix[x][y]
+                }
+        }
+
+#pragma starpu shutdown
+
+        if (display) {
+                fprintf(stdout, "mean=%f\n", mean);
+                for(x = 0; x < X; x++) {
+                        for (y = 0; y < Y; y++) {
+                                fprintf(stdout, "%3f ", matrix[x][y]);
+                        }
+                        fprintf(stdout, "\n");
+                }
+        }
+
+	return EXIT_SUCCESS;
+}

+ 6 - 0
gcc-plugin/src/starpu.c

@@ -690,6 +690,12 @@ handle_task_attribute (tree *node, tree name, tree args,
 	      "%<task%> attribute only applies to functions");
 	      "%<task%> attribute only applies to functions");
   else
   else
     {
     {
+      if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (fn))))
+	/* Raise an error but keep going to avoid spitting out too many
+	   errors at the user's face.  */
+	error_at (DECL_SOURCE_LOCATION (fn),
+		  "task return type must be %<void%>");
+
       /* This is a function declaration for something local to this
       /* This is a function declaration for something local to this
 	 translation unit, so add the `task' attribute to FN.  */
 	 translation unit, so add the `task' attribute to FN.  */
       *no_add_attrs = false;
       *no_add_attrs = false;

+ 9 - 4
gcc-plugin/tests/run-test.in

@@ -261,10 +261,15 @@ unsatisfied directives."
 
 
 (define (executable-file source)
 (define (executable-file source)
   "Return the name of the executable file corresponding to SOURCE."
   "Return the name of the executable file corresponding to SOURCE."
-  (let ((dot (string-rindex source #\.)))
-    (if dot
-        (substring source 0 dot)
-        (string-append source ".exe"))))
+  (let* ((dot (string-rindex source #\.))
+         (exe (if dot
+                  (substring source 0 dot)
+                  (string-append source ".exe")))
+         )
+  (if (string-prefix? %srcdir exe)
+      (string-append %builddir (substring exe (string-length %srcdir)))
+      exe
+      )))
 
 
 (define (compile/match file cc cflags ldflags)
 (define (compile/match file cc cflags ldflags)
   "Read directives from FILE, and compiler/link/run it.  Make sure directives
   "Read directives from FILE, and compiler/link/run it.  Make sure directives

+ 3 - 0
gcc-plugin/tests/task-errors.c

@@ -49,6 +49,9 @@ static void my_task_wrong_target_arg (int foo, char *bar) /* (error "string cons
 static void my_task_with_a_body (int foo, char *bar)
 static void my_task_with_a_body (int foo, char *bar)
   __attribute__ ((task, unused));
   __attribute__ ((task, unused));
 
 
+extern int my_task_not_void (int foo) /* (error "return type") */
+  __attribute__ ((task));
+
 
 
 static void
 static void
 my_task_cpu (int foo, float *bar)
 my_task_cpu (int foo, float *bar)

+ 5 - 7
include/starpu_scheduler.h

@@ -78,15 +78,13 @@ struct starpu_sched_policy_s {
 
 
 	/* Insert a task into the scheduler. */
 	/* Insert a task into the scheduler. */
         int (*push_task)(struct starpu_task *, unsigned);
         int (*push_task)(struct starpu_task *, unsigned);
-	/* Notify the scheduler that a task was pushed on the worker. This
-	 * method is called when a task that was explicitely assigned to a
-	 * worker is scheduled. This method therefore permits to keep the state
-	 * of of the scheduler coherent even when StarPU bypasses the
-	 * scheduling strategy. */
+	/* Notify the scheduler that a task was directly pushed to the worker
+	 * without going through the scheduler. This method is called when a
+	 * task is explicitely assigned to a worker. This method therefore
+	 * permits to keep the timing state of the scheduler coherent even
+	 * when StarPU bypasses the scheduling strategy. */
 	void (*push_task_notify)(struct starpu_task *, int workerid, unsigned);
 	void (*push_task_notify)(struct starpu_task *, int workerid, unsigned);
 
 
-	/* Insert a priority task into the scheduler. */
-        int (*push_prio_task)(struct starpu_task *, unsigned);
 
 
 	/* Get a task from the scheduler. The mutex associated to the worker is
 	/* Get a task from the scheduler. The mutex associated to the worker is
 	 * already taken when this method is called. */
 	 * already taken when this method is called. */

+ 7 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -429,12 +429,12 @@ static void benchmark_all_gpu_devices(void)
 #endif
 #endif
 
 
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	ncpus = _starpu_topology_get_nhwcpu(config);
+	ncpus = config->topology.ncpus;
 
 
 	/* TODO: measure bandwidth between GPU-GPU */
 	/* TODO: measure bandwidth between GPU-GPU */
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-        cudaGetDeviceCount(&ncuda);
+	ncuda = _starpu_get_cuda_device_count();
 	for (i = 0; i < ncuda; i++)
 	for (i = 0; i < ncuda; i++)
 	{
 	{
 		fprintf(stderr," CUDA %d...", i);
 		fprintf(stderr," CUDA %d...", i);
@@ -481,7 +481,7 @@ static void get_bus_path(const char *type, char *path, size_t maxlen)
 	char hostname[32];
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), forced_hostname);
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 	else
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
 	strncat(path, ".", maxlen);
@@ -509,11 +509,11 @@ static void load_bus_affinity_file_content(void)
 
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
-	ncpus = _starpu_topology_get_nhwcpu(config);
+	ncpus = config->topology.ncpus;
         int gpu;
         int gpu;
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-        cudaGetDeviceCount(&ncuda);
+	ncuda = _starpu_get_cuda_device_count();
 	for (gpu = 0; gpu < ncuda; gpu++)
 	for (gpu = 0; gpu < ncuda; gpu++)
 	{
 	{
 		int ret;
 		int ret;
@@ -1020,9 +1020,9 @@ static void check_bus_config_file()
                 fclose(f);
                 fclose(f);
 
 
                 // Loading current configuration
                 // Loading current configuration
-                ncpus = _starpu_topology_get_nhwcpu(config);
+                ncpus = config->topology.ncpus;
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
-                cudaGetDeviceCount(&ncuda);
+		ncuda = _starpu_get_cuda_device_count();
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
                 nopencl = _starpu_opencl_get_device_count();
                 nopencl = _starpu_opencl_get_device_count();

+ 219 - 14
src/core/perfmodel/perfmodel_history.c

@@ -204,17 +204,117 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel_
 	}
 	}
 }
 }
 
 
-static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
+static void parse_arch(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history, unsigned archmin, unsigned archmax, int skiparch)
 {
 {
-	unsigned arch;
-	unsigned nimpl;
-	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++) {
-		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++) {
-			parse_per_arch_model_file(f, &model->per_arch[arch][nimpl], scan_history);
+	unsigned arch, impl;
+	struct starpu_per_arch_perfmodel_t dummy;
+	int nimpls, implmax, skipimpl;
+	unsigned ret;
+	
+
+	for (arch = archmin; arch < archmax; arch++) {
+		_starpu_drop_comments(f);
+		ret = fscanf(f, "%d\n", &nimpls);
+		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
+		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
+		for (impl = 0; impl < implmax; impl++) {
+			parse_per_arch_model_file(f, &model->per_arch[arch][impl], scan_history);
+		}
+		if (skipimpl > 0) {
+			for (impl = 0; impl < skipimpl; impl++) {
+				parse_per_arch_model_file(f, &dummy, 0);
+			}
+		}
+	}
+
+	if (skiparch > 0) {
+		_starpu_drop_comments(f);
+		ret = fscanf(f, "%d\n", &nimpls);
+		implmax = STARPU_MIN(nimpls, STARPU_MAXIMPLEMENTATIONS);
+		skipimpl = nimpls - STARPU_MAXIMPLEMENTATIONS;
+		for (arch = 0; arch < skiparch; arch ++) {
+			for (impl = 0; impl < implmax; impl++) {
+				parse_per_arch_model_file(f, &dummy, 0);
+			}
+			if (skipimpl > 0) {
+				for (impl = 0; impl < skipimpl; impl++) {
+					parse_per_arch_model_file(f, &dummy, 0);
+				}
+			}
 		}
 		}
 	}
 	}
 }
 }
 
 
+static void parse_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned scan_history)
+{
+	unsigned ret;
+	unsigned archmin = 0;
+	unsigned max_gordondevs = 1; /* XXX : we need a STARPU_MAXGORDONDEVS cst */
+	unsigned narchs;
+	int nimpls;
+
+	/* We could probably write a clean loop here, but the code would not
+	 * really be easier to read. */
+
+	/* Parsing CPUs */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+
+	_STARPU_DEBUG("Parsing %u CPUs\n", narchs);
+	if (narchs > 0)
+	{
+		parse_arch(f, model, scan_history,
+				archmin,
+				STARPU_MIN(narchs, STARPU_MAXCPUS),
+				narchs - STARPU_MAXCPUS);
+	}
+
+	/* Parsing CUDA devs */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+	_STARPU_DEBUG("Parsing %u CUDA devices\n", narchs);
+	if (narchs > 0)
+	{
+		archmin += STARPU_MAXCPUS;
+		parse_arch(f, model, scan_history,
+				archmin,
+				archmin + STARPU_MIN(narchs, STARPU_MAXCUDADEVS),
+				narchs - STARPU_MAXCUDADEVS);
+	}
+
+	/* Parsing OpenCL devs */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+
+	_STARPU_DEBUG("Parsing %u OpenCL devices\n", narchs);
+	if (narchs > 0)
+	{
+		archmin += STARPU_MAXCUDADEVS;
+		parse_arch(f, model, scan_history,
+				archmin,
+				archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
+				narchs - STARPU_MAXOPENCLDEVS);
+	}
+
+	/* Parsing Gordon implementations */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+
+	_STARPU_DEBUG("Parsing %u Gordon devices\n", narchs);
+	if (narchs > 0)
+	{
+		archmin += STARPU_MAXOPENCLDEVS;
+		parse_arch(f, model, scan_history,
+				archmin,
+				archmin + max_gordondevs,
+				narchs - max_gordondevs);
+	}
+}
+
 
 
 static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
 static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model, unsigned arch, unsigned nimpl)
 {
 {
@@ -235,7 +335,12 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 		}
 		}
 	}
 	}
 
 
+	if (nentries == 0)
+		return;
 	/* header */
 	/* header */
+	char archname[32];
+	starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
+	fprintf(f, "# Model for %s\n", archname);
 	fprintf(f, "# number of entries\n%u\n", nentries);
 	fprintf(f, "# number of entries\n%u\n", nentries);
 
 
 	dump_reg_model(f, model, arch, nimpl);
 	dump_reg_model(f, model, arch, nimpl);
@@ -250,23 +355,123 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel_t *model,
 			ptr = ptr->next;
 			ptr = ptr->next;
 		}
 		}
 	}
 	}
+
+	fprintf(f, "\n##################\n");
 }
 }
 
 
-static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
+static unsigned get_n_entries(struct starpu_perfmodel_t *model, unsigned arch, unsigned impl)
 {
 {
-	fprintf(f, "#################\n");
+	struct starpu_per_arch_perfmodel_t *per_arch_model;
+	per_arch_model = &model->per_arch[arch][impl];
+	/* count the number of elements in the lists */
+	struct starpu_history_list_t *ptr = NULL;
+	unsigned nentries = 0;
+
+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
+	{
+		/* Dump the list of all entries in the history */
+		ptr = per_arch_model->list;
+		while(ptr) {
+			nentries++;
+			ptr = ptr->next;
+		}
+	}
+	return nentries;
+}
 
 
+static void dump_model_file(FILE *f, struct starpu_perfmodel_t *model)
+{
+	unsigned number_of_archs[4] = { 0, 0, 0, 0};
 	unsigned arch;
 	unsigned arch;
 	unsigned nimpl;
 	unsigned nimpl;
+	unsigned idx = 0;
+
+	/* Finding the number of archs to write for each kind of device */
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
 	{
 	{
+		switch (arch)
+		{
+			case STARPU_CUDA_DEFAULT:
+			case STARPU_OPENCL_DEFAULT:
+			case STARPU_GORDON_DEFAULT:
+				idx++;
+				break;
+			default:
+				break;
+		}
+
+		unsigned nentries = 0;
+		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
+		{
+			nentries = get_n_entries(model, arch, nimpl) != 0;
+			if (nentries > 0)
+			{
+				number_of_archs[idx]++;
+				break;
+			}
+		}
+	}
+
+	/* Writing stuff */
+	char *name;
+	unsigned substract_to_arch = 0;
+	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
+	{
+		switch (arch)
+		{
+			case STARPU_CPU_DEFAULT:
+				name = "CPU";
+				fprintf(f, "##################\n");
+				fprintf(f, "# %ss\n", name);
+				fprintf(f, "# number of %s architectures\n", name);
+				fprintf(f, "%u\n", number_of_archs[0]);
+				break;
+			case STARPU_CUDA_DEFAULT:
+				name = "CUDA";
+				substract_to_arch = STARPU_MAXCPUS;
+				fprintf(f, "##################\n");
+				fprintf(f, "# %ss\n", name);
+				fprintf(f, "# number of %s architectures\n", name);
+				fprintf(f, "%u\n", number_of_archs[1]);
+				break;
+			case STARPU_OPENCL_DEFAULT:
+				name = "OPENCL";
+				substract_to_arch += STARPU_MAXCUDADEVS;
+				fprintf(f, "##################\n");
+				fprintf(f, "# %ss\n", name);
+				fprintf(f, "# number of %s architectures\n", name);
+				fprintf(f, "%u\n", number_of_archs[2]);
+				break;
+			case STARPU_GORDON_DEFAULT:
+				name = "GORDON";
+				substract_to_arch += STARPU_MAXOPENCLDEVS;
+				fprintf(f, "##################\n");
+				fprintf(f, "# %ss\n", name);
+				fprintf(f, "# number of %s architectures\n", name);
+				fprintf(f, "%u\n", number_of_archs[3]);
+				break;
+			default:
+				break;
+		}
+
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 		{
 		{
-			char archname[32];
-			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, nimpl);
-			fprintf(f, "# Model for %s\n", archname);
+			if (get_n_entries(model, arch, nimpl) == 0)
+				break;
+
+		}
+		unsigned max_impl = nimpl;
+
+		if (max_impl == 0)
+			continue;
+
+		fprintf(f, "###########\n");
+		fprintf(f, "# %s_%u\n", name, arch - substract_to_arch);
+		fprintf(f, "# number of implementations\n");
+		fprintf(f, "%u\n", max_impl);
+		for (nimpl = 0; nimpl < max_impl; nimpl++)
+		{
 			dump_per_arch_model_file(f, model, arch, nimpl);
 			dump_per_arch_model_file(f, model, arch, nimpl);
-			fprintf(f, "\n##################\n");
 		}
 		}
 	}
 	}
 }
 }
@@ -300,7 +505,7 @@ static void get_model_debug_path(struct starpu_perfmodel_t *model, const char *a
 	char hostname[32];
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), forced_hostname);
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 	else
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
 	strncat(path, ".", maxlen);
@@ -351,7 +556,7 @@ static void get_model_path(struct starpu_perfmodel_t *model, char *path, size_t
 	char hostname[32];
 	char hostname[32];
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	char *forced_hostname = getenv("STARPU_HOSTNAME");
 	if (forced_hostname && forced_hostname[0])
 	if (forced_hostname && forced_hostname[0])
-		snprintf(hostname, sizeof(hostname), forced_hostname);
+		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
 	else
 	else
 		gethostname(hostname, sizeof(hostname));
 		gethostname(hostname, sizeof(hostname));
 	strncat(path, ".", maxlen);
 	strncat(path, ".", maxlen);

+ 1 - 1
src/core/workers.c

@@ -87,7 +87,7 @@ static int _starpu_may_use_nth_implementation(enum starpu_archtype arch, struct
 			cl->opencl_funcs[nimpl] == NULL);
 			cl->opencl_funcs[nimpl] == NULL);
 	case STARPU_GORDON_WORKER:
 	case STARPU_GORDON_WORKER:
 		return !(cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS &&
 		return !(cl->gordon_func == STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS &&
-			cl->gordon_funcs[nimpl] == NULL);
+			cl->gordon_funcs[nimpl] == 0);
 	default:
 	default:
 		return 0;
 		return 0;
 	}
 	}

+ 5 - 1
src/drivers/opencl/driver_opencl.c

@@ -357,6 +357,11 @@ void _starpu_opencl_init(void)
                 // Get location of OpenCl kernel source files
                 // Get location of OpenCl kernel source files
                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
                 _starpu_opencl_program_dir = getenv("STARPU_OPENCL_PROGRAM_DIR");
 
 
+		if (nb_devices > STARPU_MAXOPENCLDEVS) {
+			_STARPU_DISP("# Warning: %d OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
+			nb_devices = STARPU_MAXOPENCLDEVS;
+		}
+
                 // initialise internal structures
                 // initialise internal structures
                 for(i=0 ; i<nb_devices ; i++) {
                 for(i=0 ; i<nb_devices ; i++) {
                         contexts[i] = NULL;
                         contexts[i] = NULL;
@@ -538,7 +543,6 @@ static int _starpu_opencl_execute_job(starpu_job_t j, struct starpu_worker_s *ar
 
 
 	struct timespec codelet_start, codelet_end;
 	struct timespec codelet_start, codelet_end;
 
 
-	int workerid = args->workerid;
 	STARPU_ASSERT(task);
 	STARPU_ASSERT(task);
 	struct starpu_codelet_t *cl = task->cl;
 	struct starpu_codelet_t *cl = task->cl;
 	STARPU_ASSERT(cl);
 	STARPU_ASSERT(cl);

+ 3 - 1
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -478,6 +478,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 					) {
 					) {
 				ntasks_best_end = ntasks_end;
 				ntasks_best_end = ntasks_end;
 				ntasks_best = worker;
 				ntasks_best = worker;
+
 			}
 			}
 
 
 			if (local_task_length[worker_in_ctx] == -1.0)
 			if (local_task_length[worker_in_ctx] == -1.0)
@@ -492,7 +493,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 				unknown = 1;
 				unknown = 1;
 
 
 			if (unknown)
 			if (unknown)
-				continue;
+					continue;
 
 
 			exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
 			exp_end[worker_in_ctx] = fifo->exp_start + fifo->exp_len + local_task_length[worker_in_ctx];
 
 
@@ -501,6 +502,7 @@ static int _dmda_push_task(struct starpu_task *task, unsigned prio, struct starp
 				/* a better solution was found */
 				/* a better solution was found */
 				best_exp_end = exp_end[worker_in_ctx];
 				best_exp_end = exp_end[worker_in_ctx];
 				best_impl = nimpl;
 				best_impl = nimpl;
+
 			}
 			}
 
 
 			local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);
 			local_power[worker_in_ctx] = starpu_task_expected_power(task, perf_arch, nimpl);

+ 410 - 106
src/sched_policies/detect_combined_workers.c

@@ -21,150 +21,454 @@
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
 #include <hwloc.h>
-#endif
 
 
-#ifdef STARPU_HAVE_HWLOC
-/* This function returns 1 the subtree induced by obj only contains CPU
- * workers, otherwise 0 is returned. This function registers all valid worker
- * combination below obj. The id of the CPU workers are put in the worker_array
- * and their count is put in the worker_cnt pointer. */
-static int find_combinations_with_hwloc_rec(hwloc_obj_t obj, int *worker_array, int *worker_cnt)
+/* tree_t
+ * ======
+ * Purpose
+ * =======
+ * Structure representing a tree (which can be a sub-tree itself) whose root is an hwloc
+ * object and storing every workers it contained in every sub-trees by recursion.
+ *
+ * Fields
+ * ======
+ * obj			A hwloc object which can be a root or a leaf, it may be a numa node, a cache memory or a CPU, etc...
+ *
+ * nb_workers		Number of CPU workers which can be found by recursion in all the sub-trees beneath this one
+ 			or in this very object.
+ *
+ * workers		CPU-workers found by recursion in all the sub-trees and in this very one, represented as leaves in hwloc.
+ */
+
+typedef struct tree_s{
+    hwloc_obj_t obj;
+    unsigned nb_workers;
+    int *workers;
+} tree_t;
+
+
+/* gather_trees
+ * ============
+ * Purpose
+ * =======
+ * Gather all the workers of every source tree in one target tree.
+ * We assume the target array of workers is big enough to contain all the workers.
+ *
+ * Arguments
+ * =========
+ * target_tree		(input, output)
+ *			Pointer to the tree which will contain all the workers of every source.
+ *
+ * source_trees		(input)
+ *			Array of trees we want to combine in a unique tree.
+ *
+ * nb_source_trees	(input)
+ *			Number of trees we want to combine (size of the array).
+ */
+
+static void gather_trees(tree_t *target_tree, tree_t *source_trees, unsigned nb_source_trees)
+{
+    unsigned tree_id, worker_id, index = 0;
+    for(tree_id = 0; tree_id < nb_source_trees; ++tree_id)
+	for(worker_id = 0; worker_id < source_trees[tree_id].nb_workers; ++worker_id)
+	    target_tree->workers[index++] = source_trees[tree_id].workers[worker_id];
+}
+
+/* assign_multiple_trees
+ * ========================
+ * Purpose
+ * =======
+ * Assign every tree which is large enough (greater than min_size) and merge small ones.
+ * If there is no tree large enough to be assigned any more, we return.
+ *
+ * Return value
+ * ============
+ * The number of workers assigned during the function.
+ *
+ * Arguments
+ * =========
+ * trees		(input, output)
+ *			In entry, array of trees to assign. In the end at most one tree still contains workers.
+ *
+ * nb_trees		(input)
+ *			The number of trees (size of the array).
+ *
+ * min_size		(input)
+ *			Minimum size of a combined worker.
+ *
+ * max_size		(input)
+ *			Maximum size of a combined worker.
+ */
+
+static unsigned assign_multiple_trees(tree_t *trees, unsigned nb_trees, int min_size, int max_size)
 {
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    unsigned short complete = 0;
+    unsigned tree_id, tree_id2, nb_workers_tree, nb_workers_tree2, worker_id, nb_workers_total = 0, nb_workers_assigned = 0;
+
+    for(tree_id = 0; tree_id < nb_trees; ++tree_id)
+	nb_workers_total += trees[tree_id].nb_workers;;
 
 
-	/* Is this a leaf ? (eg. a PU for hwloc) */
-	int is_leaf = !hwloc_compare_types(config->cpu_depth, obj->depth);
+    while(!complete)
+    {
+	complete = 1;
 
 
-	if (is_leaf)
+	/* First we manage to assign every subtree large enough to be assigned alone */
+	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
 	{
 	{
-		struct starpu_worker_s *worker = obj->userdata;
+	    if(trees[tree_id].nb_workers== 0) // An already assigned subtree
+		continue;
 
 
-		/* If this is a CPU worker, append its id at the end of the
-		 * list */
-		if (worker && worker->arch == STARPU_CPU_WORKER)
+	    nb_workers_tree = trees[tree_id].nb_workers;
+
+	    /* We shouldn't assign a small tree if we could assign the whole trees instead */
+	    if(nb_workers_tree >= min_size && nb_workers_total > max_size)
+	    {
+		int ret = starpu_combined_worker_assign_workerid(nb_workers_tree, trees[tree_id].workers);
+		STARPU_ASSERT(ret >= 0);
+		nb_workers_assigned += nb_workers_tree;
+		nb_workers_total -= nb_workers_tree;
+		trees[tree_id].nb_workers = 0;
+	    }
+	}
+
+	/* Then we merge too small subtrees into not too large ones
+	 * if we manage to merge some subtrees we turn the flag
+	 * complete to 0 thus we know he have to start again to assign
+	 * just merged subtrees */
+	for(tree_id = 0; tree_id < nb_trees; ++tree_id)
+	{
+	    if(trees[tree_id].nb_workers == 0) // An already assigned subtree
+		continue;
+
+	    nb_workers_tree = trees[tree_id].nb_workers;
+
+	    /* We go through the array to find another subtree we can merge with this one */
+	    for(tree_id2 = 0; tree_id2 < nb_trees; ++tree_id2)
+	    {
+		if(trees[tree_id2].nb_workers == 0 || tree_id == tree_id2) // An already assigned subtree or the same
+		    continue;
+
+		nb_workers_tree2 = trees[tree_id2].nb_workers;
+
+		/*  We can merge the two subtrees, let's do it */
+		if(nb_workers_tree + nb_workers_tree2 <= max_size)
 		{
 		{
-			worker_array[*worker_cnt] = worker->workerid;
-			*worker_cnt = *worker_cnt + 1;
+		    for(worker_id = 0; worker_id < nb_workers_tree2; ++worker_id)
+			trees[tree_id].workers[nb_workers_tree + worker_id] = trees[tree_id2].workers[worker_id];
+
+		    trees[tree_id].nb_workers += nb_workers_tree2;
+		    trees[tree_id2].nb_workers = 0;
+
+		    /* We just merged two subtrees, we need to restart again and try to assign it */
+		    complete = 0;
+		    break;
 		}
 		}
+	    }
 
 
-		/* We cannot create a combined worker only if there is a CPU
-		 * worker. */
-		return (!worker || worker->arch == STARPU_CPU_WORKER);
+	    if(!complete)
+		break;
 	}
 	}
+    }
 
 
-	/* If there is only one child, we go to the next level directly */
-	if (obj->arity == 1)
-		return find_combinations_with_hwloc_rec(obj->children[0], worker_array, worker_cnt);
+    return nb_workers_assigned;
+}
 
 
-	/* We recursively go from the root to the leaves of the tree to find
-	 * subtrees that only have CPUs as leaves. */
-	unsigned cpu_children_cnt = 0;
+/* find_and_assign_combinations_with_hwloc_recursive
+ * =================================================
+ * Purpose
+ * =======
+ * Go through the tree given as parameter and try to assign them. Workers it didn't succeed to
+ * assign are given back to the calling function to be assigned using data from other subtrees if so.
+ *
+ * Return value
+ * ============
+ * The number of workers left to be assigned.
+ *
+ * Arguments
+ * =========
+ * tree			(input, output)
+ *			Tree structure containing the root to process in entry.
+ *			When the function returns it also contains the number of workers left
+ *			to be assigned and these very workers in the array previously allocated.
+ *
+ * min_size		(input)
+ *			Minimum size of a combined worker.
+ *
+ * max_size		(input)
+ *			Maximum size of a combined worker.
+ */
+
+static unsigned find_and_assign_combinations_with_hwloc_recursive(tree_t *tree, int min_size, int max_size)
+{
+    unsigned subtree_id, nb_workers = 0;
 
 
-	int worker_array_rec[STARPU_NMAXWORKERS];
-	int worker_cnt_rec = 0;
-	memset(worker_array_rec, 0, sizeof(int)*STARPU_NMAXWORKERS);
+    hwloc_obj_t obj = tree->obj;
+    int *workers = tree->workers;
 
 
-	unsigned i;
-	for (i = 0; i < obj->arity; i++)
+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
+
+    /* Is this a leaf ? (eg. a PU for hwloc) */
+    if (!hwloc_compare_types(config->cpu_depth, obj->depth))
+    {
+	struct starpu_worker_s *worker = obj->userdata;
+
+	/* If this is a CPU worker add it at the beginning
+	 * of the array , write 1 in the field nb_workers and
+	 * return the number of CPU workers found : 1 in this case. */
+	if (worker && worker->arch == STARPU_CPU_WORKER)
 	{
 	{
-		int valid_subtree = find_combinations_with_hwloc_rec(obj->children[i],
-						worker_array_rec, &worker_cnt_rec);
-		if (valid_subtree)
-			cpu_children_cnt++;
+	    workers[0] = worker->workerid;
+	    tree->nb_workers = 1;
+	    return 1;
 	}
 	}
 
 
-	int child;
+	tree->nb_workers = 0;
+	return 0;
+    }
+
+
+    /* If there is only one child, we go to the next level right away */
+    if (obj->arity == 1)
+    {
+	tree_t subtree = *tree;
+	subtree.obj = obj->children[0];
+	nb_workers = find_and_assign_combinations_with_hwloc_recursive(&subtree, min_size, max_size);
+	tree->nb_workers = nb_workers;
+	return nb_workers;
+    }
+
+    /* We recursively go to the leaves of the tree to find subtrees which have the biggest number of
+     * CPU leaves that fits between min and max. */
+
+    /* We allocate an array of tree structures which will contain the current node's subtrees data */
+    tree_t *subtrees = (tree_t *) malloc(obj->arity * sizeof(tree_t));
+
+    /* We allocate the array containing the workers of each subtree and initialize the fields left */
+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
+    {
+	tree_t *subtree = subtrees + subtree_id;
+
+	subtree->obj = obj->children[subtree_id];
+	subtree->nb_workers = 0;
+	subtree->workers = (int *) malloc(config->topology.nhwcpus * sizeof(int));
+    }
+
+    /* We recursively go through every subtree and get all the workers which are not assigned yet */
+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
+	nb_workers += find_and_assign_combinations_with_hwloc_recursive(subtrees + subtree_id, min_size, max_size);
+
+    if(nb_workers > max_size)
+    {
+	/* We withdraw the number of workers just assigned from the total number of workers */
+	nb_workers -= assign_multiple_trees(subtrees, obj->arity, min_size, max_size);
 
 
-	if (cpu_children_cnt == obj->arity)
-	for (child = 0; child < worker_cnt_rec; child++)
+	/* Some workers are not assigned yet : we gather them in the array
+	 * which is returned to the father which will handle them later */
+	if(nb_workers)
+	    gather_trees(tree, subtrees, obj->arity);
+    }
+    else if(nb_workers < max_size)
+    {
+	gather_trees(tree, subtrees, obj->arity);
+    }
+    else // nb_workers == max_size
+    {
+	gather_trees(tree, subtrees, obj->arity);
+
+	int ret = starpu_combined_worker_assign_workerid(nb_workers, workers);
+	STARPU_ASSERT(ret >= 0);
+	nb_workers = 0;
+    }
+
+    for(subtree_id = 0; subtree_id < obj->arity; ++subtree_id)
+	free(subtrees[subtree_id].workers);
+    free(subtrees);
+
+    tree->nb_workers = nb_workers;
+    return nb_workers;
+}
+
+/* get_min_max_sizes
+ * =================================================
+ * Purpose
+ * =======
+ * First, try to get the value from the STARPU_MIN_WORKERSIZE and STARPU_MAX_WORKERSIZE
+ * environment variables.
+ * If both of them were not set, then we try do get some efficient values following the rule beneath :
+ *
+ * 				-->   exact 	-->  MIN_SIZE = S-1 <--> MAX_SIZE = S+1
+ * S = square_root(nb_cpus)
+ *				-->   decimal 	-->  MIN_SIZE = truncation(S) <--> MAX_SIZE = rounding_up(S)
+ *
+ * If only one of both was not set then we set it with a value relative to the other, for example :
+ *
+ *		 	MIN_SIZE = MAX_SIZE - 1 or MAX_SIZE = MIN_SIZE + 1
+ *
+ * Arguments
+ * =========
+ * min_size		(output)
+ *			Pointer to the minimum size of a combined worker, whether set with
+ *			value given by the user or processed from the number of cpus.
+ *
+ * max_size		(output)
+ *			Pointer to the maximum size of a combined worker, whether set with
+ *			value given by the user or processed from the number of cpus.
+ *
+ * topology		(input)
+ *			Topology of the machine : used to know the number of cpus.
+ */
+
+static void get_min_max_sizes(int *min_size, int *max_size, struct starpu_machine_topology_s *topology)
+{
+    int _min_size, _max_size;
+
+    _min_size = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
+    _max_size = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
+
+    /* If the user didn't set both the environment variables,
+     * we need to find a minimum and a maximum size ourselves */
+    if(_min_size <= -1 || _max_size <= -1)
+    {
+
+	int nb_cpus = topology->nhwcpus;
+	int sqrt_nb_cpus = sqrt(nb_cpus);
+	short exact = (sqrt_nb_cpus * sqrt_nb_cpus == nb_cpus);
+
+	    if(_min_size == -1)
+	    {
+		if(_max_size > -1)
+		    _min_size = _max_size - 1;
+		else
+		    _min_size = exact ? sqrt_nb_cpus - 1 : sqrt_nb_cpus;
+	    }
+
+	if(_max_size == -1)
 	{
 	{
-		worker_array[*worker_cnt] = worker_array_rec[child];
-		*worker_cnt = *worker_cnt + 1;
+	    if(_min_size > -1)
+		_max_size = _min_size + 1;
+	    else
+		_max_size = sqrt_nb_cpus + 1;
 	}
 	}
-	
-	/* If there is at least 2 children that are valid, we combined them. */
-	int maxsize = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
-	int minsize = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
+    }
 
 
-	if (cpu_children_cnt > 1 && worker_cnt_rec > 0 && worker_cnt_rec <= maxsize && worker_cnt_rec >= minsize)
-		starpu_combined_worker_assign_workerid(worker_cnt_rec, worker_array_rec);
+    *min_size = _min_size;
+    *max_size = _max_size;
 
 
-	return (cpu_children_cnt == obj->arity);
+    return;
 }
 }
 
 
-static void find_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
+/* find_and_assign_combinations_with_hwloc
+ * =======================================
+ * Purpose
+ * =======
+ * Launches find_and_assign_combinations_with_hwloc_recursive function on the root
+ * of the hwloc tree to gather and assign combined cpu workers in an efficient manner.
+ * When find_and_assign_combinations_with_hwloc_recursive returns, if there are still
+ * some workers, we assign them no matter the number for there is no way to respect
+ * the wanted sizes anymore.
+ *
+ * Arguments
+ * =========
+ * topology		(input)
+ *			Topology of the machine : used to know the number of cpus and
+ *			to get the hwloc tree.
+ */
+
+static void find_and_assign_combinations_with_hwloc(struct starpu_machine_topology_s *topology)
 {
 {
-	/* We don't care about the result */
-	int worker_array[STARPU_NMAXWORKERS];
-	int worker_cnt = 0;
-
-	/* We recursively go from the root to the leaves of the tree to find
-	 * subtrees that only have CPUs as leaves. */
-	hwloc_obj_t root;
-	root = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
-	find_combinations_with_hwloc_rec(root, worker_array, &worker_cnt);
+    unsigned nb_workers;
+    int min_size, max_size;
+
+    get_min_max_sizes(&min_size, &max_size, topology);
+
+    STARPU_ASSERT(min_size <= max_size);
+
+    tree_t tree;
+
+    /* Of course we start from the root */
+    tree.obj = hwloc_get_obj_by_depth(topology->hwtopology, HWLOC_OBJ_SYSTEM, 0); 
+    tree.nb_workers = 0;
+    tree.workers = (int *) malloc(topology->nhwcpus * sizeof(int));
+
+    /* We recursively go from the root to the leaves of the tree to find
+     * subtrees that only have CPUs as leaves. */
+    nb_workers = find_and_assign_combinations_with_hwloc_recursive(&tree, min_size, max_size);
+
+    /* There are still some workers left, since the only possibility is that
+     * the number of workers left is less than the minimum worker size we assign them all */
+    if(nb_workers > 0)
+    {
+	/* find_and_assign_combinations_with_hwloc_recursive shouldn't return
+	 * while there are enough workers to assign regarding the min_size value */
+	STARPU_ASSERT(nb_workers < max_size);
+
+	int ret = starpu_combined_worker_assign_workerid(nb_workers, tree.workers);
+	STARPU_ASSERT(ret >= 0);
+    }
+
+    free(tree.workers);
 }
 }
 
 
-#else
+#else /* STARPU_HAVE_HWLOC */
 
 
-static void find_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
+static void find_and_assign_combinations_without_hwloc(struct starpu_machine_topology_s *topology)
 {
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
 
 
-	/* We put the id of all CPU workers in this array */
-	int cpu_workers[STARPU_NMAXWORKERS];
-	unsigned ncpus = 0;
+    /* We put the id of all CPU workers in this array */
+    int cpu_workers[STARPU_NMAXWORKERS];
+    unsigned ncpus = 0;
 
 
-	unsigned i;
-	for (i = 0; i < topology->nworkers; i++)
-	{
-		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
-			cpu_workers[ncpus++] = i;
-	}
-	
-	unsigned size;
-	for (size = 2; size <= ncpus; size *= 2)
+    unsigned i;
+    for (i = 0; i < topology->nworkers; i++)
+    {
+	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
+	    cpu_workers[ncpus++] = i;
+    }
+
+    unsigned size;
+    for (size = 2; size <= ncpus; size *= 2)
+    {
+	unsigned first_cpu;
+	for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
 	{
 	{
-		unsigned first_cpu;
-		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
-		{
-			if (first_cpu + size <= ncpus)
-			{
-				int workerids[size];
-
-				for (i = 0; i < size; i++)
-					workerids[i] = cpu_workers[first_cpu + i];
-
-				/* We register this combination */
-				int ret;
-				ret = starpu_combined_worker_assign_workerid(size, workerids); 
-				STARPU_ASSERT(ret >= 0);
-			}
-		}
+	    if (first_cpu + size <= ncpus)
+	    {
+		int workerids[size];
+
+		for (i = 0; i < size; i++)
+		    workerids[i] = cpu_workers[first_cpu + i];
+
+		/* We register this combination */
+		int ret;
+		ret = starpu_combined_worker_assign_workerid(size, workerids); 
+		STARPU_ASSERT(ret >= 0);
+	    }
 	}
 	}
+    }
 }
 }
-#endif
+
+#endif /* STARPU_HAVE_HWLOC */
 
 
 static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
 static void combine_all_cpu_workers(struct starpu_machine_topology_s *topology)
 {
 {
-	struct starpu_machine_config_s *config = _starpu_get_machine_config();
+    struct starpu_machine_config_s *config = _starpu_get_machine_config();
 
 
-	int cpu_workers[STARPU_NMAXWORKERS];
-	unsigned ncpus = 0;
+    int cpu_workers[STARPU_NMAXWORKERS];
+    unsigned ncpus = 0;
 
 
-	unsigned i;
-	for (i = 0; i < topology->nworkers; i++)
-	{
-		if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
-			cpu_workers[ncpus++] = i;
-	}
+    unsigned i;
+    for (i = 0; i < topology->nworkers; i++)
+    {
+	if (config->workers[i].perf_arch == STARPU_CPU_DEFAULT)
+	    cpu_workers[ncpus++] = i;
+    }
 
 
-	if (ncpus > 0)
-	{
-		int ret;
-		ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
-		STARPU_ASSERT(ret >= 0);
-	}
+    if (ncpus > 0)
+    {
+	int ret;
+	ret = starpu_combined_worker_assign_workerid(ncpus, cpu_workers);
+	STARPU_ASSERT(ret >= 0);
+    }
 }
 }
 
 
 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
 void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology)
@@ -175,9 +479,9 @@ void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *to
 	combine_all_cpu_workers(topology);
 	combine_all_cpu_workers(topology);
     else {
     else {
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
-	find_combinations_with_hwloc(topology);
+	find_and_assign_combinations_with_hwloc(topology);
 #else
 #else
-	find_combinations_without_hwloc(topology);
+	find_and_assign_combinations_without_hwloc(topology);
 #endif
 #endif
     }
     }
 }
 }

+ 1 - 0
tests/Makefile.am

@@ -121,6 +121,7 @@ noinst_PROGRAMS =				\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_cb_insert		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release		\
 	datawizard/acquire_release2		\
 	datawizard/acquire_release2		\
+	datawizard/copy				\
 	datawizard/data_implicit_deps		\
 	datawizard/data_implicit_deps		\
 	datawizard/data_lookup			\
 	datawizard/data_lookup			\
 	datawizard/scratch			\
 	datawizard/scratch			\

+ 102 - 0
tests/datawizard/copy.c

@@ -0,0 +1,102 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+static unsigned nloops = 1000;
+
+static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+}
+
+static starpu_codelet cpu_codelet =
+{
+        .where = STARPU_CPU,
+        .cpu_func = dummy_func,
+        .model = NULL,
+        .nbuffers = 1
+};
+
+static starpu_codelet gpu_codelet =
+{
+        .where = STARPU_CUDA|STARPU_OPENCL,
+        .cuda_func = dummy_func,
+        .opencl_func = dummy_func,
+        .model = NULL,
+        .nbuffers = 1
+};
+
+
+int main(int argc, char **argv)
+{
+        float foo;
+	starpu_data_handle float_array_handle;
+        int i;
+
+        starpu_init(NULL);
+
+	if (starpu_worker_get_count_by_type(STARPU_CUDA_WORKER) == 0 && starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER) == 0)
+	{
+		FPRINTF(stderr, "This application requires a CUDA or OpenCL Worker\n");
+		starpu_shutdown();
+		return 77;
+	}
+
+        foo = 0.0f;
+	starpu_variable_data_register(&float_array_handle, 0, (uintptr_t)&foo, sizeof(foo));
+
+        for (i = 0; i < nloops; i++)
+        {
+		struct starpu_task *task_cpu, *task_gpu;
+                int ret;
+
+		task_cpu = starpu_task_create();
+		task_gpu = starpu_task_create();
+
+		task_cpu->cl = &cpu_codelet;
+		task_cpu->callback_func = NULL;
+		task_cpu->buffers[0].handle = float_array_handle;
+		task_cpu->buffers[0].mode = STARPU_RW;
+
+		task_gpu->cl = &gpu_codelet;
+		task_gpu->callback_func = NULL;
+		task_gpu->buffers[0].handle = float_array_handle;
+		task_gpu->buffers[0].mode = STARPU_RW;
+
+		ret = starpu_task_submit(task_cpu);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+
+		ret = starpu_task_submit(task_gpu);
+		if (STARPU_UNLIKELY(ret == -ENODEV))
+		{
+			FPRINTF(stderr, "No worker may execute this task\n");
+			exit(0);
+		}
+        }
+
+	starpu_task_wait_for_all();
+	starpu_data_unregister(float_array_handle);
+        starpu_shutdown();
+
+        return 0;
+}

+ 8 - 0
tests/datawizard/dining_philosophers.c

@@ -23,6 +23,8 @@
 starpu_data_handle fork_handles[N];
 starpu_data_handle fork_handles[N];
 unsigned forks[N];
 unsigned forks[N];
 
 
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
 static void eat_kernel(void *descr[], void *arg)
 static void eat_kernel(void *descr[], void *arg)
 {
 {
 }
 }
@@ -78,6 +80,12 @@ int main(int argc, char **argv)
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
 
 
+	FPRINTF(stderr, "waiting done\n");
+	for (f = 0; f < N; f++)
+	{
+		starpu_data_unregister(fork_handles[f]);
+	}
+
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	return 0;
 	return 0;