瀏覽代碼

nuveau merge

Andra Hugo 12 年之前
父節點
當前提交
89a45a6db4

+ 2 - 0
ChangeLog

@@ -19,6 +19,8 @@ StarPU 1.1.0 (svn revision xxxx)
 
 New features:
   * OpenGL interoperability support.
+  * Communication statistics for MPI can also be enabled at execution
+    time by defining the environment variable STARPU_COMM_STATS
 
 StarPU 1.0.0 (svn revision 6306)
 ==============================================

+ 5 - 4
configure.ac

@@ -27,10 +27,8 @@ STARPU_MINOR_VERSION="`echo $PACKAGE_VERSION | cut -d . -f 2`"
 AC_SUBST([STARPU_MAJOR_VERSION])
 AC_SUBST([STARPU_MINOR_VERSION])
 AC_SUBST([STARPU_EFFECTIVE_VERSION])
-AC_DEFINE_UNQUOTED([STARPU_MAJOR_VERSION], [$STARPU_MAJOR_VERSION],
-  [Major version number of StarPU.])
-AC_DEFINE_UNQUOTED([STARPU_MINOR_VERSION], [$STARPU_MINOR_VERSION],
-  [Major version number of StarPU.])
+AC_DEFINE_UNQUOTED([STARPU_MAJOR_VERSION], [$STARPU_MAJOR_VERSION], [Major version number of StarPU.])
+AC_DEFINE_UNQUOTED([STARPU_MINOR_VERSION], [$STARPU_MINOR_VERSION], [Minor version number of StarPU.])
 
 . "$srcdir/STARPU-VERSION"
 AC_SUBST([LIBSTARPU_INTERFACE_CURRENT])
@@ -590,6 +588,7 @@ AC_DEFINE_UNQUOTED(STARPU_MAXOPENCLDEVS, [$nmaxopencldev],
 AC_ARG_ENABLE(opencl, [AS_HELP_STRING([--disable-opencl],
 		[do not use OpenCL device(s)])],, [enable_opencl=maybe])
 
+have_valid_opencl=no
 AC_DEFUN([STARPU_CHECK_OPENCL],
 [
     __opencl_dir=$1
@@ -1437,6 +1436,8 @@ if test "$build_socl" = "yes" ; then
        run_socl_check=yes
        AC_SUBST(SOCL_OCL_LIB_OPENCL)
    fi
+else
+   run_socl_check=no
 fi
 ###############################################################################
 #                                                                             #

+ 95 - 10
doc/chapters/basic-api.texi

@@ -7,22 +7,34 @@
 @c See the file starpu.texi for copying conditions.
 
 @menu
-* Initialization and Termination::  Initialization and Termination methods
-* Workers' Properties::         Methods to enumerate workers' properties
-* Data Management::                Methods to manipulate data
+* Versioning::
+* Initialization and Termination::
+* Workers' Properties::
+* Data Management::
 * Data Interfaces::
 * Data Partition::
-* Codelets and Tasks::          Methods to construct tasks
-* Explicit Dependencies::       Explicit Dependencies
-* Implicit Data Dependencies::  Implicit Data Dependencies
+* Codelets and Tasks::
+* Explicit Dependencies::
+* Implicit Data Dependencies::
 * Performance Model API::
-* Profiling API::               Profiling API
-* CUDA extensions::             CUDA extensions
-* OpenCL extensions::           OpenCL extensions
-* Cell extensions::             Cell extensions
+* Profiling API::
+* CUDA extensions::
+* OpenCL extensions::
+* Cell extensions::
 * Miscellaneous helpers::
 @end menu
 
+@node Versioning
+@section Versioning
+
+@defmac STARPU_MAJOR_VERSION
+Define the major version of StarPU
+@end defmac
+
+@defmac STARPU_MINOR_VERSION
+Define the minor version of StarPU
+@end defmac
+
 @node Initialization and Termination
 @section Initialization and Termination
 
@@ -784,6 +796,16 @@ Return a pointer to the variable designated by @var{interface}.
 Return the size of the variable designated by @var{interface}.
 @end defmac
 
+@defmac STARPU_VARIABLE_GET_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the variable designated by @var{interface}, to be
+used on OpenCL. The offset documented below has to be used in addition to this.
+@end defmac
+
+@defmac STARPU_VARIABLE_GET_OFFSET ({void *}@var{interface})
+Return the offset in the variable designated by @var{interface}, to be used
+with the device handle.
+@end defmac
+
 @node Accessing Vector Data Interfaces
 @subsubsection Vector Data Interfaces
 
@@ -1006,14 +1028,36 @@ Return the number of non-zero values in the matrix designated by @var{interface}
 Return a pointer to the non-zero values of the matrix designated by @var{interface}.
 @end defmac
 
+@defmac STARPU_BCSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the array of non-zero values in the matrix designated
+by @var{interface}. The offset documented below has to be used in addition to 
+this.
+@end defmac
+
 @defmac STARPU_BCSR_GET_COLIND ({void *}@var{interface})
 Return a pointer to the column index of the matrix designated by @var{interface}.
 @end defmac
 
+@defmac STARPU_BCSR_GET_COLIND_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the column index of the matrix designated by
+@var{interface}. The offset documented below has to be used in addition to
+this.
+@end defmac
+
 @defmac STARPU_BCSR_GET_ROWPTR ({void *}@var{interface})
 Return a pointer to the row pointer array of the matrix designated by @var{interface}.
 @end defmac
 
+@defmac STARPU_BCSR_GET_ROWPTR_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the row pointer array of the matrix designated by
+@var{interface}. The offset documented below has to be used in addition to
+this.
+@end defmac
+
+@defmac STARPU_BCSR_GET_OFFSET ({void *}@var{interface})
+Return the offset in the arrays (colind, rowptr, nzval) of the matrix
+designated by @var{interface}, to be used with the device handles.
+@end defmac
 
 @node Accessing CSR Data Interfaces
 @subsubsection CSR Data Interfaces
@@ -1059,14 +1103,37 @@ Return the size of the row pointer array of the matrix designated by @var{interf
 Return a pointer to the non-zero values of the matrix designated by @var{interface}.
 @end defmac
 
+@defmac STARPU_CSR_GET_NZVAL_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the array of non-zero values in the matrix designated
+by @var{interface}. The offset documented below has to be used in addition to 
+this.
+@end defmac
+
 @defmac STARPU_CSR_GET_COLIND ({void *}@var{interface})
 Return a pointer to the column index of the matrix designated by @var{interface}.
 @end defmac
 
+@defmac STARPU_CSR_GET_COLIND_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the column index of the matrix designated by
+@var{interface}. The offset documented below has to be used in addition to
+this.
+@end defmac
+
 @defmac STARPU_CSR_GET_ROWPTR ({void *}@var{interface})
 Return a pointer to the row pointer array of the matrix designated by @var{interface}.
 @end defmac
 
+@defmac STARPU_CSR_GET_ROWPTR_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the row pointer array of the matrix designated by
+@var{interface}. The offset documented below has to be used in addition to
+this.
+@end defmac
+
+@defmac STARPU_CSR_GET_OFFSET ({void *}@var{interface})
+Return the offset in the arrays (colind, rowptr, nzval) of the matrix
+designated by @var{interface}, to be used with the device handles.
+@end defmac
+
 @defmac STARPU_CSR_GET_FIRSTENTRY ({void *}@var{interface})
 Return the index at which all arrays (the column indexes, the row pointers...)
 of the @var{interface} start.
@@ -1083,13 +1150,31 @@ Return the size of the elements registered into the matrix designated by @var{in
 Return a pointer to the column array of the matrix designated by
 @var{interface}.
 @end defmac
+@defmac STARPU_COO_GET_COLUMNS_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the column array of the matrix designated by
+@var{interface}, to be used on OpenCL. The offset documented below has to be
+used in addition to this.
+@end defmac
 @defmac STARPU_COO_GET_ROWS (interface)
 Return a pointer to the rows array of the matrix designated by @var{interface}.
 @end defmac
+@defmac STARPU_COO_GET_ROWS_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the row array of the matrix designated by
+@var{interface}, to be used on OpenCL. The offset documented below has to be
+used in addition to this.
+@end defmac
 @defmac STARPU_COO_GET_VALUES (interface)
 Return a pointer to the values array of the matrix designated by
 @var{interface}.
 @end defmac
+@defmac STARPU_COO_GET_VALUES_DEV_HANDLE ({void *}@var{interface})
+Return a device handle for the value array of the matrix designated by
+@var{interface}, to be used on OpenCL. The offset documented below has to be
+used in addition to this.
+@end defmac
+@defmac STARPU_COO_GET_OFFSET ({void *}@var{interface})
+Return the offset in the arrays of the COO matrix designated by @var{interface}.
+@end defmac
 @defmac STARPU_COO_GET_NX (interface)
 Return the number of elements on the x-axis of the matrix designated by
 @var{interface}.

+ 7 - 0
doc/chapters/configuration.texi

@@ -176,6 +176,7 @@ Use the @command{mpicc} compiler at @var{path}, for starpumpi
 (@pxref{StarPU MPI support}).
 
 @item --enable-comm-stats
+@anchor{enable-comm-stats}
 Enable communication statistics for starpumpi (@pxref{StarPU MPI
 support}).
 
@@ -435,6 +436,12 @@ THE SOCL test suite is only run when the environment variable
 @code{SOCL_OCL_LIB_OPENCL} is defined. It should contain the location
 of the libOpenCL.so file of the OCL ICD implementation.
 
+@item @code{STARPU_COMM_STATS}
+Communication statistics for starpumpi (@pxref{StarPU MPI support})
+will be enabled when the environment variable @code{STARPU_COMM_STATS}
+is defined. The statistics can also be enabled by configuring StarPU
+with the option @code{--enable-comm-stats} (@pxref{enable-comm-stats}).
+
 @end table
 
 @node Misc

+ 4 - 2
doc/starpu.texi

@@ -47,8 +47,10 @@ Free Documentation License''.
 @contents
 @page
 
+@ifnottex
 @node Top
-@top Preface
+@top StarPU Handbook
+@end ifnottex
 
 This manual documents the usage of StarPU version @value{VERSION}.  It
 was last updated on @value{UPDATED}.
@@ -86,6 +88,7 @@ was last updated on @value{UPDATED}.
 * Using StarPU::                How to run StarPU application
 * Basic Examples::              Basic examples of the use of StarPU
 * Advanced Examples::           Advanced examples of the use of StarPU
+* Benchmarks::                  Benchmarks worth running
 * Performance optimization::    How to optimize performance with StarPU
 * Performance feedback::        Performance debugging tools
 * Tips and Tricks::             Tips and tricks to know about
@@ -93,7 +96,6 @@ was last updated on @value{UPDATED}.
 * StarPU FFT support::          How to perform FFT computations with StarPU
 * C Extensions::                Easier StarPU programming with GCC
 * SOCL OpenCL Extensions::      How to use OpenCL on top of StarPU
-* Benchmarks::                  Benchmarks worth running
 * StarPU Basic API::            The Basic API to use StarPU
 * StarPU Advanced API::         Advanced use of StarPU
 * Configuring StarPU::          How to configure StarPU

+ 1 - 1
include/starpu.h

@@ -143,7 +143,7 @@ struct starpu_conf
 	int disable_opencl_asynchronous_copy;
 
 	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
-	int *cuda_opengl_interoperability;
+	unsigned *cuda_opengl_interoperability;
 	unsigned n_cuda_opengl_interoperability;
 
 	/* A driver that the application will run in one of its own threads. */

+ 2 - 2
include/starpu_cuda.h

@@ -39,14 +39,14 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 #define STARPU_CUDA_REPORT_ERROR(status) \
 	starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
-size_t starpu_cuda_get_global_mem_size(int devid);
+size_t starpu_cuda_get_global_mem_size(unsigned devid);
 cudaStream_t starpu_cuda_get_local_stream(void);
 
 const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
 
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
-void starpu_cuda_set_device(int devid);
+void starpu_cuda_set_device(unsigned devid);
 
 #ifdef __cplusplus
 }

+ 24 - 0
include/starpu_data_interfaces.h

@@ -217,10 +217,17 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, uint32_t home_node,
 
 #define STARPU_COO_GET_COLUMNS(interface) \
 	(((struct starpu_coo_interface *)(interface))->columns)
+#define STARPU_COO_GET_COLUMNS_DEV_HANDLE(interface) \
+	(((struct starpu_coo_interface *)(interface))->columns)
 #define STARPU_COO_GET_ROWS(interface) \
 	(((struct starpu_coo_interface *)(interface))->rows)
+#define STARPU_COO_GET_ROWS_DEV_HANDLE(interface) \
+	(((struct starpu_coo_interface *)(interface))->rows)
 #define STARPU_COO_GET_VALUES(interface) \
 	(((struct starpu_coo_interface *)(interface))->values)
+#define STARPU_COO_GET_VALUES_DEV_HANDLE(interface) \
+	(((struct starpu_coo_interface *)(interface))->values)
+#define STARPU_COO_GET_OFFSET 0
 #define STARPU_COO_GET_NX(interface) \
 	(((struct starpu_coo_interface *)(interface))->nx)
 #define STARPU_COO_GET_NY(interface) \
@@ -302,6 +309,9 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 /* helper methods */
 #define STARPU_VARIABLE_GET_PTR(interface)	(((struct starpu_variable_interface *)(interface))->ptr)
 #define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((struct starpu_variable_interface *)(interface))->elemsize)
+#define STARPU_VARIABLE_GET_DEV_HANDLE(interface) \
+	(((struct starpu_variable_interface *)(interface))->ptr)
+#define STARPU_VARIABLE_GET_OFFSET 0
 
 /* void interface. There is no data really associated to that interface, but it
  * may be used as a synchronization mechanism. It also permits to express an
@@ -338,8 +348,15 @@ size_t starpu_csr_get_elemsize(starpu_data_handle_t handle);
 #define STARPU_CSR_GET_NNZ(interface)	(((struct starpu_csr_interface *)(interface))->nnz)
 #define STARPU_CSR_GET_NROW(interface)	(((struct starpu_csr_interface *)(interface))->nrow)
 #define STARPU_CSR_GET_NZVAL(interface)	(((struct starpu_csr_interface *)(interface))->nzval)
+#define STARPU_CSR_GET_NZVAL_DEV_HANDLE(interface) \
+	(((struct starpu_csr_interface *)(interface))->nzval) /* handle for the non-zero values array, not the nnz count */
 #define STARPU_CSR_GET_COLIND(interface)	(((struct starpu_csr_interface *)(interface))->colind)
+#define STARPU_CSR_GET_COLIND_DEV_HANDLE(interface) \
+	(((struct starpu_csr_interface *)(interface))->colind)
 #define STARPU_CSR_GET_ROWPTR(interface)	(((struct starpu_csr_interface *)(interface))->rowptr)
+#define STARPU_CSR_GET_ROWPTR_DEV_HANDLE(interface) \
+	(((struct starpu_csr_interface *)(interface))->rowptr) /* handle for the row pointer array */
+#define STARPU_CSR_GET_OFFSET 0
 #define STARPU_CSR_GET_FIRSTENTRY(interface)	(((struct starpu_csr_interface *)(interface))->firstentry)
 #define STARPU_CSR_GET_ELEMSIZE(interface)	(((struct starpu_csr_interface *)(interface))->elemsize)
 
@@ -370,8 +387,15 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handle, uint32_t home_node,
 
 #define STARPU_BCSR_GET_NNZ(interface)        (((struct starpu_bcsr_interface *)(interface))->nnz)
 #define STARPU_BCSR_GET_NZVAL(interface)      (((struct starpu_bcsr_interface *)(interface))->nzval)
+#define STARPU_BCSR_GET_NZVAL_DEV_HANDLE(interface) \
+	(((struct starpu_bcsr_interface *)(interface))->nzval) /* handle for the non-zero values array, not the nnz count */
 #define STARPU_BCSR_GET_COLIND(interface)     (((struct starpu_bcsr_interface *)(interface))->colind)
+#define STARPU_BCSR_GET_COLIND_DEV_HANDLE(interface) \
+	(((struct starpu_bcsr_interface *)(interface))->colind)
 #define STARPU_BCSR_GET_ROWPTR(interface)     (((struct starpu_bcsr_interface *)(interface))->rowptr)
+#define STARPU_BCSR_GET_ROWPTR_DEV_HANDLE(interface) \
+	(((struct starpu_bcsr_interface *)(interface))->rowptr)
+#define STARPU_BCSR_GET_OFFSET 0
 uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle);
 uint32_t starpu_bcsr_get_nrow(starpu_data_handle_t handle);
 uint32_t starpu_bcsr_get_firstentry(starpu_data_handle_t handle);

+ 2 - 2
include/starpu_opencl.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,13 +18,13 @@
 #ifndef __STARPU_OPENCL_H__
 #define __STARPU_OPENCL_H__
 
+#include <starpu_config.h>
 #ifdef STARPU_USE_OPENCL
 #ifdef __APPLE__
 #include <OpenCL/cl.h>
 #else
 #include <CL/cl.h>
 #endif
-#include <starpu_config.h>
 #include <assert.h>
 
 #ifdef __cplusplus

+ 1 - 0
mpi/src/Makefile.am

@@ -36,6 +36,7 @@ noinst_HEADERS =					\
 	starpu_mpi_private.h				\
 	starpu_mpi_fxt.h				\
 	starpu_mpi_stats.h				\
+	starpu_mpi_insert_task.h			\
 	starpu_mpi_datatype.h
 
 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\

+ 5 - 5
mpi/src/starpu_mpi.c

@@ -22,6 +22,7 @@
 #include <starpu_mpi_private.h>
 #include <starpu_profiling.h>
 #include <starpu_mpi_stats.h>
+#include <starpu_mpi_insert_task.h>
 
 /* TODO find a better way to select the polling method (perhaps during the
  * configuration) */
@@ -781,10 +782,6 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 static
 int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
 {
-#ifndef STARPU_MPI_CACHE
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --disable-mpi-cache\n");
-#endif
-
 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
 	_STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
@@ -822,6 +819,7 @@ int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
 
 	_starpu_mpi_add_sync_point_in_fxt();
 	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
+	_starpu_mpi_tables_init(MPI_COMM_WORLD);
 	return 0;
 }
 
@@ -838,10 +836,11 @@ int starpu_mpi_initialize_extended(int *rank, int *world_size)
 int starpu_mpi_shutdown(void)
 {
 	void *value;
-	int rank;
+	int rank, world_size;
 
 	/* We need to get the  rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
 
 	/* kill the progression thread */
 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
@@ -861,6 +860,7 @@ int starpu_mpi_shutdown(void)
 
 	_starpu_mpi_comm_amounts_display(rank);
 	_starpu_mpi_comm_amounts_free();
+	_starpu_mpi_tables_free(world_size);
 
 	return 0;
 }

+ 8 - 1
mpi/src/starpu_mpi_fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,6 +22,10 @@
 #include <common/config.h>
 #include <common/fxt.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define FUT_MPI_BARRIER		0x5201
 #define FUT_MPI_ISEND		0x5202
 #define FUT_MPI_IRECV_END	0x5203
@@ -40,6 +44,9 @@
 #define TRACE_MPI_IRECV_END(a, b)	do {} while(0);
 #endif
 
+#ifdef __cplusplus
+}
+#endif
 
 
 #endif // __STARPU_MPI_FXT_H__

+ 82 - 40
mpi/src/starpu_mpi_insert_task.c

@@ -40,23 +40,49 @@ struct _starpu_data_entry **sent_data = NULL;
 struct _starpu_data_entry **received_data = NULL;
 #endif /* STARPU_MPI_CACHE */
 
-static void _starpu_mpi_tables_init()
+void _starpu_mpi_tables_init(MPI_Comm comm)
 {
 #ifdef STARPU_MPI_CACHE
-	if (sent_data == NULL) {
-		int nb_nodes;
-		int i;
+	int nb_nodes;
+	int i;
 
-		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
-		_STARPU_MPI_DEBUG("Initialising htable for cache\n");
-		sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-		for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
-		received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
-		for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
-	}
+	MPI_Comm_size(comm, &nb_nodes);
+	_STARPU_MPI_DEBUG("Initialising htable for cache\n");
+	sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
+	received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
+	for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
+#else
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --disable-mpi-cache\n");
 #endif /* STARPU_MPI_CACHE */
 }
 
+/* Release the per-node cache tables (sent_data / received_data).
+ *
+ * world_size is the number of per-node slots to walk; it must not exceed
+ * the number of slots allocated by _starpu_mpi_tables_init().
+ *
+ * Every hash entry of every slot is unlinked and freed, then the two
+ * tables themselves are freed and reset to NULL so that no dangling
+ * pointer is left behind. A table that was never allocated is skipped.
+ *
+ * Does nothing when StarPU is built without STARPU_MPI_CACHE. */
+void _starpu_mpi_tables_free(int world_size)
+{
+#ifdef STARPU_MPI_CACHE
+	int i;
+
+	_STARPU_MPI_DEBUG("Clearing htable for cache\n");
+
+	for(i=0 ; i<world_size ; i++)
+	{
+		struct _starpu_data_entry *entry, *tmp;
+		/* Guard against tables that were never allocated */
+		if (sent_data != NULL)
+		{
+			HASH_ITER(hh, sent_data[i], entry, tmp)
+			{
+				HASH_DEL(sent_data[i], entry);
+				free(entry);
+			}
+		}
+		if (received_data != NULL)
+		{
+			HASH_ITER(hh, received_data[i], entry, tmp)
+			{
+				HASH_DEL(received_data[i], entry);
+				free(entry);
+			}
+		}
+	}
+	free(sent_data);
+	sent_data = NULL;
+	free(received_data);
+	received_data = NULL;
+#endif
+}
+
 static
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
 {
@@ -107,6 +133,47 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access
 }
 
 static
+void *_starpu_mpi_already_received(starpu_data_handle_t data, int mpi_rank)
+{
+#ifdef STARPU_MPI_CACHE
+	struct _starpu_data_entry *already_received;
+	HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
+	if (already_received == NULL) {
+		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+		entry->data = data;
+		HASH_ADD_PTR(received_data[mpi_rank], data, entry);
+	}
+	else {
+		_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
+	}
+	return already_received;
+#else
+	return NULL;
+#endif
+}
+
+static
+void *_starpu_mpi_already_sent(starpu_data_handle_t data, int dest)
+{
+#ifdef STARPU_MPI_CACHE
+	struct _starpu_data_entry *already_sent;
+	HASH_FIND_PTR(sent_data[dest], &data, already_sent);
+	if (already_sent == NULL) {
+		struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
+		entry->data = data;
+		HASH_ADD_PTR(sent_data[dest], data, entry);
+		_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
+	}
+	else {
+		_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
+	}
+	return already_sent;
+#else
+	return NULL;
+#endif
+}
+
+static
 void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
 {
 	if (data && mode & STARPU_R) {
@@ -123,19 +190,8 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 		/* The task needs to read this data */
 		if (do_execute && mpi_rank != me && mpi_rank != -1) {
 			/* I will have to execute but I don't have the data, receive */
-#ifdef STARPU_MPI_CACHE
-			struct _starpu_data_entry *already_received;
-			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
-			if (already_received == NULL) {
-				struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
-				entry->data = data;
-				HASH_ADD_PTR(received_data[mpi_rank], data, entry);
-			}
-			else {
-				_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
-			}
-			if (!already_received)
-#endif
+			void *already_received = _starpu_mpi_already_received(data, mpi_rank);
+			if (already_received == NULL)
 			{
 				_STARPU_MPI_DEBUG("Receive data %p from %d\n", data, mpi_rank);
 				starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
@@ -143,20 +199,8 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
 		}
 		if (!do_execute && mpi_rank == me) {
 			/* Somebody else will execute it, and I have the data, send it. */
-#ifdef STARPU_MPI_CACHE
-			struct _starpu_data_entry *already_sent;
-			HASH_FIND_PTR(sent_data[dest], &data, already_sent);
-			if (already_sent == NULL) {
-				struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
-				entry->data = data;
-				HASH_ADD_PTR(sent_data[dest], data, entry);
-				_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
-			}
-			else {
-				_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
-			}
-			if (!already_sent)
-#endif
+			void *already_sent = _starpu_mpi_already_sent(data, dest);
+			if (already_sent == NULL)
 			{
 				_STARPU_MPI_DEBUG("Send data %p to %d\n", data, dest);
 				starpu_mpi_isend_detached(data, dest, mpi_tag, comm, NULL, NULL);
@@ -251,8 +295,6 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
 	size_on_nodes = (size_t *)calloc(1, nb_nodes * sizeof(size_t));
 
-	_starpu_mpi_tables_init();
-
 	/* Get the number of buffers and the size of the arguments */
 	va_start(varg_list, codelet);
 	arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);

+ 33 - 0
mpi/src/starpu_mpi_insert_task.h

@@ -0,0 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_INSERT_TASK_H__
+#define __STARPU_MPI_INSERT_TASK_H__
+
+#include <mpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void _starpu_mpi_tables_init(MPI_Comm comm);
+void _starpu_mpi_tables_free(int world_size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_INSERT_TASK_H__

+ 8 - 0
mpi/src/starpu_mpi_private.h

@@ -26,6 +26,10 @@
 #include <common/utils.h>
 #include <pthread.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 //#define STARPU_MPI_VERBOSE	1
 
 #ifdef STARPU_MPI_VERBOSE
@@ -96,4 +100,8 @@ LIST_TYPE(_starpu_mpi_req,
 	void (*callback)(void *);
 );
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif // __STARPU_MPI_PRIVATE_H__

+ 19 - 14
mpi/src/starpu_mpi_stats.c

@@ -21,53 +21,59 @@
 #include <starpu_mpi_private.h>
 
 /* measure the amount of data transfers between each pair of MPI nodes */
-#ifdef STARPU_COMM_STATS
 static size_t *comm_amount;
 static int world_size;
-#endif /* STARPU_COMM_STATS */
+static int stats_enabled=0;
 
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 {
 #ifdef STARPU_COMM_STATS
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-comm-stats, which slows down a bit\n");
+	stats_enabled = 1;
+#else
+	stats_enabled = starpu_get_env_number("STARPU_COMM_STATS");
+	if (stats_enabled == -1)
+	{
+		stats_enabled = 0;
+	}
+#endif /* STARPU_COMM_STATS */
+
+	if (stats_enabled == 0) return;
+
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-comm-stats or is executed with STARPU_COMM_STATS=1, which slows down a bit\n");
 
 	MPI_Comm_size(comm, &world_size);
 	_STARPU_MPI_DEBUG("allocating for %d nodes\n", world_size);
 
 	comm_amount = (size_t *) calloc(world_size, sizeof(size_t));
-#endif /* STARPU_COMM_STATS */
 }
 
 void _starpu_mpi_comm_amounts_free()
 {
-#ifdef STARPU_COMM_STATS
+	if (stats_enabled == 0) return;
 	free(comm_amount);
-#endif /* STARPU_COMM_STATS */
 }
 
-void _starpu_mpi_comm_amounts_inc(MPI_Comm comm  __attribute__ ((unused)),
-				  unsigned dst  __attribute__ ((unused)),
-				  MPI_Datatype datatype  __attribute__ ((unused)),
-				  int count __attribute__ ((unused)))
+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count)
 {
-#ifdef STARPU_COMM_STATS
 	int src, size;
 
+	if (stats_enabled == 0) return;
+
 	MPI_Comm_rank(comm, &src);
 	MPI_Type_size(datatype, &size);
 
 	_STARPU_MPI_DEBUG("[%d] adding %d to %d\n", src, count*size, dst);
 
 	comm_amount[dst] += count*size;
-#endif /* STARPU_COMM_STATS */
 }
 
 void _starpu_mpi_comm_amounts_display(int node)
 {
-#ifdef STARPU_COMM_STATS
 	unsigned dst;
 	size_t sum = 0;
 
+	if (stats_enabled == 0) return;
+
 	for (dst = 0; dst < world_size; dst++)
 	{
 		sum += comm_amount[dst];
@@ -83,6 +89,5 @@ void _starpu_mpi_comm_amounts_display(int node)
 				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024));
 		}
 	}
-#endif /* STARPU_COMM_STATS */
 }
 

+ 12 - 0
mpi/src/starpu_mpi_stats.h

@@ -14,11 +14,23 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#ifndef __STARPU_MPI_STATS_H__
+#define __STARPU_MPI_STATS_H__
+
 #include <stdlib.h>
 #include <mpi.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm);
 void _starpu_mpi_comm_amounts_free();
 void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype, int count);
 void _starpu_mpi_comm_amounts_display(int node);
 
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_STATS_H__

+ 11 - 11
src/drivers/cuda/driver_cuda.c

@@ -58,7 +58,7 @@ _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
 	config->topology.nhwcudagpus = cnt;
 }
 
-static void limit_gpu_mem_if_needed(int devid)
+static void limit_gpu_mem_if_needed(unsigned devid)
 {
 	cudaError_t cures;
 	int limit = starpu_get_env_number("STARPU_LIMIT_GPU_MEM");
@@ -77,7 +77,7 @@ static void limit_gpu_mem_if_needed(int devid)
 
 	props[devid].totalGlobalMem -= to_waste;
 
-	_STARPU_DEBUG("CUDA device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
+	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
 			devid, (size_t)to_waste/(1024*1024), (size_t)limit, (size_t)totalGlobalMem/(1024*1024),
 			(size_t)(totalGlobalMem - to_waste)/(1024*1024));
 
@@ -87,7 +87,7 @@ static void limit_gpu_mem_if_needed(int devid)
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 
-static void unlimit_gpu_mem_if_needed(int devid)
+static void unlimit_gpu_mem_if_needed(unsigned devid)
 {
 	cudaError_t cures;
 
@@ -101,7 +101,7 @@ static void unlimit_gpu_mem_if_needed(int devid)
 	}
 }
 
-size_t starpu_cuda_get_global_mem_size(int devid)
+size_t starpu_cuda_get_global_mem_size(unsigned devid)
 {
 	return (size_t)props[devid].totalGlobalMem;
 }
@@ -127,7 +127,7 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 	return &props[devid];
 }
 
-void starpu_cuda_set_device(int devid)
+void starpu_cuda_set_device(unsigned devid)
 {
 	cudaError_t cures;
 	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
@@ -162,7 +162,7 @@ done:
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 
-static void init_context(int devid)
+static void init_context(unsigned devid)
 {
 	cudaError_t cures;
 	int workerid;
@@ -221,7 +221,7 @@ static void init_context(int devid)
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 
-static void deinit_context(int workerid, int devid)
+static void deinit_context(int workerid, unsigned devid)
 {
 	cudaError_t cures;
 
@@ -333,7 +333,7 @@ int _starpu_cuda_driver_init(struct starpu_driver *d)
 	struct _starpu_worker* args = _starpu_get_worker_from_driver(d);
 	STARPU_ASSERT(args);
 
-	int devid = args->devid;
+	unsigned devid = args->devid;
 
 	_starpu_worker_init(args, _STARPU_FUT_CUDA_KEY);
 
@@ -357,10 +357,10 @@ int _starpu_cuda_driver_init(struct starpu_driver *d)
 #endif
 		snprintf(args->name, sizeof(args->name), "CUDA %u (%s %.1f GiB %02x:%02x.0)", args->devid, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
 #else
-	snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB)", args->devid, devname, size);
+	snprintf(args->name, sizeof(args->name), "CUDA %u (%s %.1f GiB)", args->devid, devname, size);
 #endif
-	snprintf(args->short_name, sizeof(args->short_name), "CUDA %d", args->devid);
-	_STARPU_DEBUG("cuda (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
+	snprintf(args->short_name, sizeof(args->short_name), "CUDA %u", args->devid);
+	_STARPU_DEBUG("cuda (%s) dev id %u thread is ready to run on CPU %d !\n", devname, devid, args->bindid);
 
 	_STARPU_TRACE_WORKER_INIT_END
 

+ 4 - 1
src/drivers/opencl/driver_opencl_utils.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -365,6 +365,9 @@ int _starpu_opencl_compile_or_load_opencl_from_file(const char *source_file_name
 
 	starpu_opencl_load_program_source(source_file_name, located_file_name, located_dir_name, opencl_program_source);
 
+	if (!build_options)
+		build_options = "";
+
 	if (!strcmp(located_dir_name, ""))
 		strcpy(new_build_options, build_options);
 	else if (build_options)

+ 2 - 2
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
@@ -152,7 +152,7 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 	int workerid = starpu_worker_get_id();
 	struct _starpu_fifo_taskq *fifo = dt->queue_array[workerid];
 
-	task = _starpu_fifo_pop_task(fifo, workerid);
+	task = _starpu_fifo_pop_local_task(fifo);
 	if (task)
 	{
 		double model = task->predicted;

+ 18 - 1
src/sched_policies/fifo_queues.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
@@ -94,6 +94,23 @@ struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo_queue,
 	return NULL;
 }
 
+/* Same as _starpu_fifo_pop_task, but without checking that the worker can
+ * execute the task (useful when the scheduler has already checked it).
+ * Pops the back of taskq and decrements ntasks; returns NULL if empty. */
+struct starpu_task *_starpu_fifo_pop_local_task(struct _starpu_fifo_taskq *fifo_queue)
+{
+	struct starpu_task *task = NULL;
+
+	if (!starpu_task_list_empty(&fifo_queue->taskq))
+	{
+		task = starpu_task_list_pop_back(&fifo_queue->taskq);
+		fifo_queue->ntasks--;
+		_STARPU_TRACE_JOB_POP(task, 0);
+	}
+
+	return task;
+}
+
 /* pop every task that can be executed on the calling driver */
 struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo_queue, pthread_mutex_t *sched_mutex, int workerid)
 {

+ 2 - 1
src/sched_policies/fifo_queues.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -47,6 +47,7 @@ int _starpu_fifo_empty(struct _starpu_fifo_taskq *fifo);
 int _starpu_fifo_push_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, pthread_cond_t *sched_cond, struct starpu_task *task);
 
 struct starpu_task *_starpu_fifo_pop_task(struct _starpu_fifo_taskq *fifo, int workerid);
+struct starpu_task *_starpu_fifo_pop_local_task(struct _starpu_fifo_taskq *fifo);
 struct starpu_task *_starpu_fifo_pop_every_task(struct _starpu_fifo_taskq *fifo, pthread_mutex_t *sched_mutex, int workerid);
 
 #endif // __FIFO_QUEUES_H__

+ 5 - 0
tests/main/driver_api/init_run_deinit.c

@@ -52,6 +52,8 @@ run(struct starpu_task *task, struct starpu_driver *d)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_driver_run_once(d);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_driver_run_once");
+	ret = starpu_task_wait(task);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait");
 }
 
 static void
@@ -107,6 +109,7 @@ test_cpu(void)
 		cl.where = STARPU_CPU;
 		task->cl = &cl;
 		task->cl_arg = &var;
+		task->detach = 0;
 
 		run(task, &d);
 	}
@@ -166,6 +169,7 @@ test_cuda(void)
 		cl.where = STARPU_CUDA;
 		task->cl = &cl;
 		task->cl_arg = &var;
+		task->detach = 0;
 
 		run(task, &d);
 	}
@@ -244,6 +248,7 @@ test_opencl(void)
 		cl.where = STARPU_OPENCL;
 		task->cl = &cl;
 		task->cl_arg = &var;
+		task->detach = 0;
 
 		run(task, &d);
 	}

+ 1 - 1
tests/sched_policies/data_locality.c

@@ -29,7 +29,7 @@ cost_function(struct starpu_task *task, unsigned nimpl)
 {
 	(void) task;
 	(void) nimpl;
-	return 0.0;
+	return 1.0;
 }
 
 static struct starpu_perfmodel model =

+ 3 - 3
tools/gdbinit

@@ -2,7 +2,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010-2012  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -110,10 +110,10 @@ end
 define starpu-workers
   set language c
   set $num=0
-  printf "[Id] Name       Arch Mask Devid Bindid Workerid Isrunning Isinitialized\n"
+  printf "[Id] Name                                     Arch Mask Devid Bindid Workerid Isrunning Isinitialized\n"
   while $num<config->topology->nworkers
     set $worker=&config->workers[$num]
-    printf "[%2d] %-10s %-4d %-4d %-5d %-6d %-8d %-9d %-13d\n", $num, $worker->name, $worker->arch, $worker->worker_mask, \
+    printf "[%2d] %-40s %-4d %-4d %-5d %-6d %-8d %-9d %-13d\n", $num, $worker->name, $worker->arch, $worker->worker_mask, \
           $worker->devid, $worker->bindid, $worker->workerid, $worker->worker_is_running, $worker->worker_is_initialized
     set $num = $num + 1
   end

+ 3 - 3
tools/starpu_workers_activity.in

@@ -2,7 +2,7 @@
 
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # 
-# Copyright (C) 2010  Université de Bordeaux 1
+# Copyright (C) 2010, 2012  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 # 
 # StarPU is free software; you can redistribute it and/or modify
@@ -37,12 +37,12 @@ usage()
     exit 0
 }
 
-if [ "$1" == "-v" ] || [ "$1" == "--version" ] ; then
+if [ "$1" = "-v" ] || [ "$1" = "--version" ] ; then
     echo "$PROGNAME (@PACKAGE_NAME@) @PACKAGE_VERSION@"
     exit 0
 fi
 
-if [ "$1" == "-h" ] || [ "$1" == "--help" ] || [ "$1" == "" ] ; then
+if [ "$1" = "-h" ] || [ "$1" = "--help" ] || [ "$1" = "" ] ; then
     usage
 fi