Browse Source

merge m4+socl+mpi+include

Andra Hugo 14 years ago
parent
commit
a9dbfd117e
92 changed files with 4090 additions and 2565 deletions
  1. 14 5
      include/starpu.h
  2. 13 2
      include/starpu_bound.h
  3. 23 0
      include/starpu_config.h.in
  4. 11 6
      include/starpu_cuda.h
  5. 62 48
      include/starpu_data.h
  6. 18 15
      include/starpu_data_filters.h
  7. 187 120
      include/starpu_data_interfaces.h
  8. 61 0
      include/starpu_deprecated_api.h
  9. 3 3
      include/starpu_expert.h
  10. 69 0
      include/starpu_fxt.h
  11. 39 0
      include/starpu_hash.h
  12. 48 13
      include/starpu_opencl.h
  13. 108 30
      include/starpu_perfmodel.h
  14. 11 8
      include/starpu_profiling.h
  15. 28 15
      include/starpu_scheduler.h
  16. 80 51
      include/starpu_task.h
  17. 95 51
      include/starpu_task_bundle.h
  18. 23 4
      include/starpu_task_list.h
  19. 72 124
      include/starpu_top.h
  20. 69 70
      include/starpu_util.h
  21. 95 0
      m4/acinclude.m4
  22. 33 3
      m4/gcc.m4
  23. 0 214
      magma_tests/time_zpotrf_tile.c
  24. 0 747
      magma_tests/timing.c
  25. 64 34
      mpi/Makefile.am
  26. 62 36
      mpi/examples/cholesky/mpi_cholesky.c
  27. 14 7
      mpi/examples/cholesky/mpi_cholesky.h
  28. 34 22
      mpi/examples/cholesky/mpi_cholesky_distributed.c
  29. 21 17
      mpi/examples/cholesky/mpi_cholesky_kernels.c
  30. 7 9
      mpi/examples/cholesky/mpi_cholesky_models.c
  31. 3 3
      mpi/examples/cholesky/mpi_cholesky_models.h
  32. 27 27
      mpi/examples/mpi_lu/plu_example.c
  33. 41 49
      mpi/examples/mpi_lu/pxlu.c
  34. 7 7
      mpi/examples/mpi_lu/pxlu.h
  35. 21 17
      mpi/examples/mpi_lu/pxlu_kernels.c
  36. 4 4
      mpi/examples/mpi_lu/pxlu_kernels.h
  37. 155 0
      mpi/examples/reduction/mpi_reduction.c
  38. 66 0
      mpi/examples/reduction/mpi_reduction_kernels.c
  39. 11 9
      mpi/examples/scatter_gather/mpi_scatter_gather.c
  40. 53 35
      mpi/examples/stencil/stencil5.c
  41. 29 0
      mpi/libstarpumpi.pc.in
  42. 125 125
      mpi/starpu_mpi.c
  43. 24 19
      mpi/starpu_mpi.h
  44. 2 2
      mpi/starpu_mpi_collective.c
  45. 16 11
      mpi/starpu_mpi_datatype.c
  46. 11 3
      mpi/starpu_mpi_datatype.h
  47. 4 4
      mpi/starpu_mpi_helper.c
  48. 150 124
      mpi/starpu_mpi_insert_task.c
  49. 94 0
      mpi/starpu_mpi_insert_task_cache.c
  50. 26 0
      mpi/starpu_mpi_insert_task_cache.h
  51. 19 19
      mpi/starpu_mpi_private.h
  52. 29 0
      mpi/starpumpi-1.0.pc.in
  53. 29 19
      mpi/tests/block_interface.c
  54. 30 19
      mpi/tests/block_interface_pinned.c
  55. 22 0
      mpi/tests/helper.h
  56. 55 34
      mpi/tests/insert_task.c
  57. 61 37
      mpi/tests/insert_task_block.c
  58. 59 36
      mpi/tests/insert_task_cache.c
  59. 107 69
      mpi/tests/insert_task_owner.c
  60. 33 22
      mpi/tests/insert_task_owner2.c
  61. 23 14
      mpi/tests/insert_task_owner_data.c
  62. 14 11
      mpi/tests/mpi_detached_tag.c
  63. 14 11
      mpi/tests/mpi_irecv.c
  64. 20 17
      mpi/tests/mpi_irecv_detached.c
  65. 14 11
      mpi/tests/mpi_isend.c
  66. 20 17
      mpi/tests/mpi_isend_detached.c
  67. 18 13
      mpi/tests/mpi_test.c
  68. 26 16
      mpi/tests/multiple_send.c
  69. 14 12
      mpi/tests/pingpong.c
  70. 25 22
      mpi/tests/ring.c
  71. 22 20
      mpi/tests/ring_async.c
  72. 22 17
      mpi/tests/ring_async_implicit.c
  73. 10 2
      socl/Makefile.am
  74. 67 0
      socl/examples/Makefile.am
  75. 211 0
      socl/examples/basic/basic.c
  76. 302 0
      socl/examples/clinfo/clinfo.c
  77. 507 0
      socl/examples/mandelbrot/mandelbrot.c
  78. 29 0
      socl/socl-1.0.pc.in
  79. 24 7
      socl/src/Makefile.am
  80. 3 3
      socl/src/cl_enqueuecopybuffer.c
  81. 1 1
      socl/src/cl_enqueuemapbuffer.c
  82. 2 2
      socl/src/cl_enqueuendrangekernel.c
  83. 3 3
      socl/src/cl_enqueuereadbuffer.c
  84. 3 3
      socl/src/cl_enqueuewritebuffer.c
  85. 0 1
      socl/src/cl_finish.c
  86. 2 2
      socl/src/cl_getkernelworkgroupinfo.c
  87. 3 3
      socl/src/cl_getplatformids.c
  88. 3 3
      socl/src/command.c
  89. 1 1
      socl/src/command.h
  90. 6 1
      socl/src/init.c
  91. 1 1
      socl/src/socl.h
  92. 3 3
      socl/src/task.c

+ 14 - 5
include/starpu.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -37,6 +37,8 @@ typedef unsigned long long uint64_t;
 
 #include <starpu_util.h>
 #include <starpu_data.h>
+#include <starpu_data_interfaces.h>
+#include <starpu_data_filters.h>
 #include <starpu_perfmodel.h>
 #include <starpu_task.h>
 #include <starpu_task_list.h>
@@ -44,13 +46,15 @@ typedef unsigned long long uint64_t;
 #include <starpu_expert.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-struct starpu_conf {
+struct starpu_conf
+{
 	/* which scheduling policy should be used ? (NULL for default) */
 	const char *sched_policy_name;
-	struct starpu_sched_policy_s *sched_policy;
+	struct starpu_sched_policy *sched_policy;
 
 	/* number of CPU workers (-1 for default) */
 	int ncpus;
@@ -108,7 +112,8 @@ int starpu_combined_worker_get_id(void);
 int starpu_combined_worker_get_size(void);
 int starpu_combined_worker_get_rank(void);
 
-enum starpu_archtype {
+enum starpu_archtype
+{
 	STARPU_CPU_WORKER, /* CPU core */
 	STARPU_CUDA_WORKER, /* NVIDIA CUDA device */
 	STARPU_OPENCL_WORKER, /* OpenCL device */
@@ -156,4 +161,8 @@ int starpu_worker_get_devid(int id);
 }
 #endif
 
+#if defined(STARPU_USE_DEPRECATED_API)
+#include "starpu_deprecated_api.h"
+#endif /* STARPU_USE_DEPRECATED_API */
+
 #endif /* __STARPU_H__ */

+ 13 - 2
include/starpu_bound.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,6 +22,13 @@
 #ifndef __STARPU_BOUND_H__
 #define __STARPU_BOUND_H__
 
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
 /* Start recording tasks (resets stats).  `deps' tells whether dependencies
  * should be recorded too (this is quite expensive).  */
 void starpu_bound_start(int deps, int prio);
@@ -31,7 +38,7 @@ void starpu_bound_stop(void);
 /* Print the DAG that was recorded */
 void starpu_bound_print_dot(FILE *output);
 
-/* Get theoretical upper bound (needs glpk support) */
+/* Get theoretical upper bound (in ms) (needs glpk support) */
 void starpu_bound_compute(double *res, double *integer_res, int integer);
 
 /* Emit Linear Programming system on output for the recorded tasks in lp format */
@@ -43,4 +50,8 @@ void starpu_bound_print_mps(FILE *output);
 /* Emit statistics of actual execution vs theoretical upper bound */
 void starpu_bound_print(FILE *output, int integer);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* __STARPU_BOUND_H__ */

+ 23 - 0
include/starpu_config.h.in

@@ -1,11 +1,33 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
 #ifndef __STARPU_CONFIG_PUBLIC_H__
 #define __STARPU_CONFIG_PUBLIC_H__
 
+#undef STARPU_MAJOR_VERSION
+#undef STARPU_MINOR_VERSION
+
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CUDA
 #undef STARPU_USE_OPENCL
 #undef STARPU_USE_GORDON
 
+#undef STARPU_HAVE_ICC
+
 #undef STARPU_USE_MPI
 
 #undef STARPU_ATLAS
@@ -41,6 +63,7 @@
 
 #undef STARPU_HAVE_CURAND
 
+#undef STARPU_MAXNODES
 #undef STARPU_NMAXBUFS
 #undef STARPU_MAXCPUS
 #undef STARPU_MAXCUDADEVS

+ 11 - 6
include/starpu_cuda.h

@@ -18,27 +18,32 @@
 #ifndef __STARPU_CUDA_H__
 #define __STARPU_CUDA_H__
 
+#include <starpu_config.h>
+
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #include <cublas.h>
-#include <starpu_config.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-void starpu_cublas_report_error(const char *func, cublasStatus status);
+void starpu_cublas_report_error(const char *func, const char *file, int line, cublasStatus status);
 #define STARPU_CUBLAS_REPORT_ERROR(status) \
-	starpu_cublas_report_error(__starpu_func__, status)
+	starpu_cublas_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
-void starpu_cuda_report_error(const char *func, cudaError_t status);
+void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status);
 #define STARPU_CUDA_REPORT_ERROR(status) \
-	starpu_cuda_report_error(__starpu_func__, status)
+	starpu_cuda_report_error(__starpu_func__, __FILE__, __LINE__, status)
 
+size_t starpu_cuda_get_global_mem_size(int devid);
 cudaStream_t starpu_cuda_get_local_stream(void);
 
+const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid);
+
 #ifdef __cplusplus
 }
 #endif

+ 62 - 48
include/starpu_data.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,58 +19,61 @@
 #define __STARPU_DATA_H__
 
 #include <starpu.h>
-#include <starpu_config.h>
-
-struct starpu_data_state_t;
-typedef struct starpu_data_state_t * starpu_data_handle;
-
-#include <starpu_data_interfaces.h>
-#include <starpu_data_filters.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-#define STARPU_R	(1<<0)
-#define STARPU_W	(1<<1)
-#define STARPU_RW	(STARPU_R|STARPU_W)
-#define STARPU_SCRATCH	(1<<2)
-#define STARPU_REDUX	(1<<3)
-typedef uint32_t starpu_access_mode;
+struct _starpu_data_state;
+typedef struct _starpu_data_state* starpu_data_handle_t;
+
+enum starpu_access_mode
+{
+	STARPU_NONE=0,
+	STARPU_R=(1<<0),
+	STARPU_W=(1<<1),
+	STARPU_RW=(STARPU_R|STARPU_W),
+	STARPU_SCRATCH=(1<<2),
+	STARPU_REDUX=(1<<3)
+};
 
-typedef struct starpu_buffer_descr_t {
-	starpu_data_handle handle;
-	starpu_access_mode mode;
-} starpu_buffer_descr;
+struct starpu_buffer_descr
+{
+	starpu_data_handle_t handle;
+	enum starpu_access_mode mode;
+};
 
-struct starpu_data_interface_ops_t;
+struct starpu_data_interface_ops;
 
 /* Destroy the data handle, in case we don't need to update the value of the
  * data in the home node, we can use starpu_data_unregister_no_coherency
  * instead. */
-void starpu_data_unregister(starpu_data_handle handle);
-void starpu_data_unregister_no_coherency(starpu_data_handle handle);
+void starpu_data_unregister(starpu_data_handle_t handle);
+void starpu_data_unregister_no_coherency(starpu_data_handle_t handle);
 
 /* Destroy all data replicates. After data invalidation, the first access to
  * the handle must be performed in write-only mode. */
-void starpu_data_invalidate(starpu_data_handle);
+void starpu_data_invalidate(starpu_data_handle_t handle);
 
-void starpu_data_advise_as_important(starpu_data_handle handle, unsigned is_important);
+void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important);
 
-int starpu_data_acquire(starpu_data_handle handle, starpu_access_mode mode);
-int starpu_data_acquire_cb(starpu_data_handle handle,
-			starpu_access_mode mode, void (*callback)(void *), void *arg);
+int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mode);
+int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_access_mode mode, void (*callback)(void *), void *arg);
 #ifdef __GCC__
-#  define STARPU_DATA_ACQUIRE_CB(handle, mode, code) do { \
-	void callback(void *arg) { \
-		code; \
-		starpu_data_release(handle); \
-	} \
-	starpu_data_acquire_cb(handle, mode, callback, NULL); \
-} while(0)
+#  define STARPU_DATA_ACQUIRE_CB(handle, mode, code) do \
+	{ \						\
+		void callback(void *arg)		\
+		{					\
+			code;				\
+			starpu_data_release(handle);  	\
+		}			      		\
+		starpu_data_acquire_cb(handle, mode, callback, NULL);	\
+	}						\
+	while(0)
 #endif
 
-void starpu_data_release(starpu_data_handle handle);
+void starpu_data_release(starpu_data_handle_t handle);
 
 int starpu_malloc(void **A, size_t dim);
 int starpu_free(void *A);
@@ -80,36 +83,47 @@ int starpu_free(void *A);
 #define starpu_data_malloc_pinned_if_possible	starpu_malloc
 #define starpu_data_free_pinned_if_possible	starpu_free
 
-int starpu_data_request_allocation(starpu_data_handle handle, uint32_t node);
+int starpu_data_request_allocation(starpu_data_handle_t handle, uint32_t node);
 
-int starpu_data_prefetch_on_node(starpu_data_handle handle, unsigned node, unsigned async);
+int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async);
+
+
+enum starpu_node_kind
+{
+	STARPU_UNUSED     = 0x00,
+	STARPU_CPU_RAM    = 0x01,
+	STARPU_CUDA_RAM   = 0x02,
+	STARPU_OPENCL_RAM = 0x03,
+	STARPU_SPU_LS     = 0x04
+};
 
 unsigned starpu_worker_get_memory_node(unsigned workerid);
+unsigned starpu_memory_nodes_get_count(void);
+enum starpu_node_kind starpu_node_get_kind(uint32_t node);
+
 
 /* It is possible to associate a mask to a piece of data (and its children) so
  * that when it is modified, it is automatically transferred into those memory
  * nodes. For instance a (1<<0) write-through mask means that the CUDA workers will
  * commit their changes in main memory (node 0). */
-void starpu_data_set_wt_mask(starpu_data_handle handle, uint32_t wt_mask);
+void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask);
 
-void starpu_data_set_sequential_consistency_flag(starpu_data_handle handle, unsigned flag);
+void starpu_data_set_sequential_consistency_flag(starpu_data_handle_t handle, unsigned flag);
 unsigned starpu_data_get_default_sequential_consistency_flag(void);
 void starpu_data_set_default_sequential_consistency_flag(unsigned flag);
 
 /* Query the status of the handle on the specified memory node. */
-void starpu_data_query_status(starpu_data_handle handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested);
+void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested);
 
-struct starpu_codelet_t;
+struct starpu_codelet;
 
-void starpu_data_set_reduction_methods(starpu_data_handle handle,
-					struct starpu_codelet_t *redux_cl,
-					struct starpu_codelet_t *init_cl);
+void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl);
 
-int starpu_data_set_rank(starpu_data_handle handle, int rank);
-int starpu_data_get_rank(starpu_data_handle handle);
+int starpu_data_set_rank(starpu_data_handle_t handle, int rank);
+int starpu_data_get_rank(starpu_data_handle_t handle);
 
-int starpu_data_set_tag(starpu_data_handle handle, int tag);
-int starpu_data_get_tag(starpu_data_handle handle);
+int starpu_data_set_tag(starpu_data_handle_t handle, int tag);
+int starpu_data_get_tag(starpu_data_handle_t handle);
 
 #ifdef __cplusplus
 }

+ 18 - 15
include/starpu_data_filters.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,41 +19,44 @@
 #ifndef __STARPU_DATA_FILTERS_H__
 #define __STARPU_DATA_FILTERS_H__
 
-#include <stdarg.h>
-
 #include <starpu.h>
-#include <starpu_config.h>
+#include <stdarg.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-struct starpu_data_interface_ops_t;
+struct starpu_data_interface_ops;
 
-struct starpu_data_filter {
+struct starpu_data_filter
+{
 	void (*filter_func)(void *father_interface, void *child_interface, struct starpu_data_filter *, unsigned id, unsigned nparts);
         unsigned (*get_nchildren)(struct starpu_data_filter *, starpu_data_handle initial_handle);
         struct starpu_data_interface_ops_t *(*get_child_ops)(struct starpu_data_filter *, unsigned id);
         unsigned filter_arg;
         unsigned nchildren;
+        unsigned (*get_nchildren)(struct starpu_data_filter *, starpu_data_handle_t initial_handle);
+        struct starpu_data_interface_ops *(*get_child_ops)(struct starpu_data_filter *, unsigned id);
+        unsigned filter_arg;
         void *filter_arg_ptr;
 };
 
-void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data_filter *f);
-void starpu_data_unpartition(starpu_data_handle root_data, uint32_t gathering_node);
+void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f);
+void starpu_data_unpartition(starpu_data_handle_t root_data, uint32_t gathering_node);
 
-int starpu_data_get_nb_children(starpu_data_handle handle);
-starpu_data_handle starpu_data_get_child(starpu_data_handle handle, unsigned i);
+int starpu_data_get_nb_children(starpu_data_handle_t handle);
+starpu_data_handle_t starpu_data_get_child(starpu_data_handle_t handle, unsigned i);
 
 /* unsigned list */
-starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_data, unsigned depth, ... );
+starpu_data_handle_t starpu_data_get_sub_data(starpu_data_handle_t root_data, unsigned depth, ... );
 /* Same, but using va_list */
-starpu_data_handle starpu_data_vget_sub_data(starpu_data_handle root_data, unsigned depth, va_list pa );
+starpu_data_handle_t starpu_data_vget_sub_data(starpu_data_handle_t root_data, unsigned depth, va_list pa);
 
 /* struct starpu_data_filter * list */
-void starpu_data_map_filters(starpu_data_handle root_data, unsigned nfilters, ...);
+void starpu_data_map_filters(starpu_data_handle_t root_data, unsigned nfilters, ...);
 /* Same, but using va_list */
-void starpu_data_vmap_filters(starpu_data_handle root_data, unsigned nfilters, va_list pa);
+void starpu_data_vmap_filters(starpu_data_handle_t root_data, unsigned nfilters, va_list pa);
 
 /* a few examples of filters */
 

+ 187 - 120
include/starpu_data_interfaces.h

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Institut National de Recherche en Informatique et Automatique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011-2012  Institut National de Recherche en Informatique et Automatique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,8 +20,6 @@
 #define __STARPU_DATA_INTERFACES_H__
 
 #include <starpu.h>
-#include <starpu_data.h>
-#include <starpu_util.h>
 
 #ifdef STARPU_USE_GORDON
 /* to get the gordon_strideSize_t data structure from gordon */
@@ -38,14 +36,16 @@ typedef void *cudaStream_t;
 #endif
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 /* The following structures are used to describe data interfaces */
 
 /* This structure contains the different methods to transfer data between the
  * different types of memory nodes */
-struct starpu_data_copy_methods {
+struct starpu_data_copy_methods
+{
 	/* src type is ram */
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
@@ -87,9 +87,23 @@ struct starpu_data_copy_methods {
 #endif
 };
 
-struct starpu_data_interface_ops_t {
+enum starpu_data_interface_id
+{
+	STARPU_MATRIX_INTERFACE_ID=0,
+	STARPU_BLOCK_INTERFACE_ID=1,
+	STARPU_VECTOR_INTERFACE_ID=2,
+	STARPU_CSR_INTERFACE_ID=3,
+	STARPU_BCSR_INTERFACE_ID=4,
+	STARPU_VARIABLE_INTERFACE_ID=5,
+	STARPU_VOID_INTERFACE_ID=6,
+	STARPU_MULTIFORMAT_INTERFACE_ID=7,
+	STARPU_MAX_INTERFACE_ID=8 /* maximum number of data interfaces */
+};
+
+struct starpu_data_interface_ops
+{
 	/* Register an existing interface into a data handle. */
-	void (*register_data_handle)(starpu_data_handle handle,
+	void (*register_data_handle)(starpu_data_handle_t handle,
 					uint32_t home_node, void *data_interface);
 	/* Allocate data for the interface on a given node. */
 	starpu_ssize_t (*allocate_data_on_node)(void *data_interface, uint32_t node);
@@ -98,39 +112,42 @@ struct starpu_data_interface_ops_t {
 	/* ram/cuda/spu/opencl synchronous and asynchronous transfer methods */
 	const struct starpu_data_copy_methods *copy_methods;
 	/* Return the current pointer (if any) for the handle on the given node. */
-	void * (*handle_to_pointer)(starpu_data_handle handle, uint32_t node);
+	void * (*handle_to_pointer)(starpu_data_handle_t handle, uint32_t node);
 	/* Return an estimation of the size of data, for performance models */
-	size_t (*get_size)(starpu_data_handle handle);
+	size_t (*get_size)(starpu_data_handle_t handle);
 	/* Return a 32bit footprint which characterizes the data size */
-	uint32_t (*footprint)(starpu_data_handle handle);
+	uint32_t (*footprint)(starpu_data_handle_t handle);
 	/* Compare the data size of two interfaces */
 	int (*compare)(void *data_interface_a, void *data_interface_b);
 	/* Dump the sizes of a handle to a file */
-	void (*display)(starpu_data_handle handle, FILE *f);
+	void (*display)(starpu_data_handle_t handle, FILE *f);
 #ifdef STARPU_USE_GORDON
 	/* Convert the data size to the spu size format */
 	int (*convert_to_gordon)(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss); 
 #endif
 	/* an identifier that is unique to each interface */
-	unsigned interfaceid;
+	enum starpu_data_interface_id interfaceid;
 	/* The size of the interface data descriptor */
 	size_t interface_size;
+
+	int is_multiformat;
+	struct starpu_multiformat_data_interface_ops* (*get_mf_ops)(void *data_interface);
 };
 
-void starpu_data_register(starpu_data_handle *handleptr, uint32_t home_node,
-				void *data_interface,
-				struct starpu_data_interface_ops_t *ops);
+/* Return the next available id for a data interface */
+int starpu_data_interface_get_next_id();
+
+void starpu_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, void *data_interface, struct starpu_data_interface_ops *ops);
+void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc);
 
 /* Return the pointer associated with HANDLE on node NODE or NULL if HANDLE's
  * interface does not support this operation or data for this handle is not
  * allocated on that node. */
-void *starpu_handle_to_pointer(starpu_data_handle handle, uint32_t node);
+void *starpu_handle_to_pointer(starpu_data_handle_t handle, uint32_t node);
 
 /* Return the local pointer associated with HANDLE or NULL if HANDLE's
  * interface does not have data allocated locally */
-void *starpu_handle_get_local_ptr(starpu_data_handle handle);
-
-extern struct starpu_data_interface_ops_t _starpu_interface_matrix_ops;
+void *starpu_handle_get_local_ptr(starpu_data_handle_t handle);
 
 /* "node" means memory node: 0 for main RAM, then 1, 2, etc. for various GPUs,
  * etc.
@@ -139,10 +156,16 @@ extern struct starpu_data_interface_ops_t _starpu_interface_matrix_ops;
  * case 0 should be passed.
  */
 
-void *starpu_data_get_interface_on_node(starpu_data_handle handle, unsigned memory_node);
+void *starpu_data_get_interface_on_node(starpu_data_handle_t handle, unsigned memory_node);
+
+#ifdef STARPU_DEVEL
+#  warning the declaration below is needed for the spvm example (dw_block_spmv.c:110) which filters a data in sub-data with a different interface. However exposing a private object is certainly not something to do. Either turn the object public or find another way to specify interfaces when filtering
+#endif /* STARPU_DEVEL */
+extern struct starpu_data_interface_ops _starpu_interface_matrix_ops;
 
 /* Matrix interface for dense matrices */
-typedef struct starpu_matrix_interface_s {
+struct starpu_matrix_interface
+{
 	uintptr_t ptr;
         uintptr_t dev_handle;
         size_t offset;
@@ -150,28 +173,29 @@ typedef struct starpu_matrix_interface_s {
 	uint32_t ny;
 	uint32_t ld;
 	size_t elemsize;
-} starpu_matrix_interface_t;
+};
 
-void starpu_matrix_data_register(starpu_data_handle *handle, uint32_t home_node,
-                        uintptr_t ptr, uint32_t ld, uint32_t nx,
-                        uint32_t ny, size_t elemsize);
-uint32_t starpu_matrix_get_nx(starpu_data_handle handle);
-uint32_t starpu_matrix_get_ny(starpu_data_handle handle);
-uint32_t starpu_matrix_get_local_ld(starpu_data_handle handle);
-uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle handle);
-size_t starpu_matrix_get_elemsize(starpu_data_handle handle);
+void starpu_matrix_data_register(starpu_data_handle_t *handle, uint32_t home_node, uintptr_t ptr, uint32_t ld, uint32_t nx, uint32_t ny, size_t elemsize);
+uint32_t starpu_matrix_get_nx(starpu_data_handle_t handle);
+uint32_t starpu_matrix_get_ny(starpu_data_handle_t handle);
+uint32_t starpu_matrix_get_local_ld(starpu_data_handle_t handle);
+uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle_t handle);
+size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle);
 
 /* helper methods */
-#define STARPU_MATRIX_GET_PTR(interface)	(((starpu_matrix_interface_t *)(interface))->ptr)
-#define STARPU_MATRIX_GET_NX(interface)	(((starpu_matrix_interface_t *)(interface))->nx)
-#define STARPU_MATRIX_GET_NY(interface)	(((starpu_matrix_interface_t *)(interface))->ny)
-#define STARPU_MATRIX_GET_LD(interface)	(((starpu_matrix_interface_t *)(interface))->ld)
-#define STARPU_MATRIX_GET_ELEMSIZE(interface)	(((starpu_matrix_interface_t *)(interface))->elemsize)
+#define STARPU_MATRIX_GET_PTR(interface)	(((struct starpu_matrix_interface *)(interface))->ptr)
+#define STARPU_MATRIX_GET_DEV_HANDLE(interface)	(((struct starpu_matrix_interface *)(interface))->dev_handle)
+#define STARPU_MATRIX_GET_OFFSET(interface)	(((struct starpu_matrix_interface *)(interface))->offset)
+#define STARPU_MATRIX_GET_NX(interface)	(((struct starpu_matrix_interface *)(interface))->nx)
+#define STARPU_MATRIX_GET_NY(interface)	(((struct starpu_matrix_interface *)(interface))->ny)
+#define STARPU_MATRIX_GET_LD(interface)	(((struct starpu_matrix_interface *)(interface))->ld)
+#define STARPU_MATRIX_GET_ELEMSIZE(interface)	(((struct starpu_matrix_interface *)(interface))->elemsize)
 
 
 /* BLOCK interface for 3D dense blocks */
 /* TODO: rename to 3dmatrix? */
-typedef struct starpu_block_interface_s {
+struct starpu_block_interface
+{
 	uintptr_t ptr;
         uintptr_t dev_handle;
         size_t offset;
@@ -181,76 +205,80 @@ typedef struct starpu_block_interface_s {
 	uint32_t ldy;	/* number of elements between two lines */
 	uint32_t ldz;	/* number of elements between two planes */
 	size_t elemsize;
-} starpu_block_interface_t;
-
-void starpu_block_data_register(starpu_data_handle *handle, uint32_t home_node,
-                        uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
-                        uint32_t ny, uint32_t nz, size_t elemsize);
-uint32_t starpu_block_get_nx(starpu_data_handle handle);
-uint32_t starpu_block_get_ny(starpu_data_handle handle);
-uint32_t starpu_block_get_nz(starpu_data_handle handle);
-uint32_t starpu_block_get_local_ldy(starpu_data_handle handle);
-uint32_t starpu_block_get_local_ldz(starpu_data_handle handle);
-uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle);
-size_t starpu_block_get_elemsize(starpu_data_handle handle);
+};
+
+void starpu_block_data_register(starpu_data_handle_t *handle, uint32_t home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx, uint32_t ny, uint32_t nz, size_t elemsize);
+uint32_t starpu_block_get_nx(starpu_data_handle_t handle);
+uint32_t starpu_block_get_ny(starpu_data_handle_t handle);
+uint32_t starpu_block_get_nz(starpu_data_handle_t handle);
+uint32_t starpu_block_get_local_ldy(starpu_data_handle_t handle);
+uint32_t starpu_block_get_local_ldz(starpu_data_handle_t handle);
+uintptr_t starpu_block_get_local_ptr(starpu_data_handle_t handle);
+size_t starpu_block_get_elemsize(starpu_data_handle_t handle);
 
 /* helper methods */
-#define STARPU_BLOCK_GET_PTR(interface)	(((starpu_block_interface_t *)(interface))->ptr)
-#define STARPU_BLOCK_GET_NX(interface)	(((starpu_block_interface_t *)(interface))->nx)
-#define STARPU_BLOCK_GET_NY(interface)	(((starpu_block_interface_t *)(interface))->ny)
-#define STARPU_BLOCK_GET_NZ(interface)	(((starpu_block_interface_t *)(interface))->nz)
-#define STARPU_BLOCK_GET_LDY(interface)	(((starpu_block_interface_t *)(interface))->ldy)
-#define STARPU_BLOCK_GET_LDZ(interface)	(((starpu_block_interface_t *)(interface))->ldz)
-#define STARPU_BLOCK_GET_ELEMSIZE(interface)	(((starpu_block_interface_t *)(interface))->elemsize)
+#define STARPU_BLOCK_GET_PTR(interface)	(((struct starpu_block_interface *)(interface))->ptr)
+#define STARPU_BLOCK_GET_DEV_HANDLE(interface)	(((struct starpu_block_interface *)(interface))->dev_handle)
+#define STARPU_BLOCK_GET_OFFSET(interface)	(((struct starpu_block_interface *)(interface))->offset)
+#define STARPU_BLOCK_GET_NX(interface)	(((struct starpu_block_interface *)(interface))->nx)
+#define STARPU_BLOCK_GET_NY(interface)	(((struct starpu_block_interface *)(interface))->ny)
+#define STARPU_BLOCK_GET_NZ(interface)	(((struct starpu_block_interface *)(interface))->nz)
+#define STARPU_BLOCK_GET_LDY(interface)	(((struct starpu_block_interface *)(interface))->ldy)
+#define STARPU_BLOCK_GET_LDZ(interface)	(((struct starpu_block_interface *)(interface))->ldz)
+#define STARPU_BLOCK_GET_ELEMSIZE(interface)	(((struct starpu_block_interface *)(interface))->elemsize)
 
 /* vector interface for contiguous (non-strided) buffers */
-typedef struct starpu_vector_interface_s {
+struct starpu_vector_interface
+{
 	uintptr_t ptr;
         uintptr_t dev_handle;
         size_t offset;
 	uint32_t nx;
 	size_t elemsize;
-} starpu_vector_interface_t;
+};
 
-void starpu_vector_data_register(starpu_data_handle *handle, uint32_t home_node,
-                        uintptr_t ptr, uint32_t nx, size_t elemsize);
-uint32_t starpu_vector_get_nx(starpu_data_handle handle);
-size_t starpu_vector_get_elemsize(starpu_data_handle handle);
-uintptr_t starpu_vector_get_local_ptr(starpu_data_handle handle);
+void starpu_vector_data_register(starpu_data_handle_t *handle, uint32_t home_node, uintptr_t ptr, uint32_t nx, size_t elemsize);
+uint32_t starpu_vector_get_nx(starpu_data_handle_t handle);
+size_t starpu_vector_get_elemsize(starpu_data_handle_t handle);
+uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
 
 /* helper methods */
-#define STARPU_VECTOR_GET_PTR(interface)	(((starpu_vector_interface_t *)(interface))->ptr)
-#define STARPU_VECTOR_GET_NX(interface)	(((starpu_vector_interface_t *)(interface))->nx)
-#define STARPU_VECTOR_GET_ELEMSIZE(interface)	(((starpu_vector_interface_t *)(interface))->elemsize)
+#define STARPU_VECTOR_GET_PTR(interface)	(((struct starpu_vector_interface *)(interface))->ptr)
+#define STARPU_VECTOR_GET_DEV_HANDLE(interface)	(((struct starpu_vector_interface *)(interface))->dev_handle)
+#define STARPU_VECTOR_GET_OFFSET(interface)	(((struct starpu_vector_interface *)(interface))->offset)
+#define STARPU_VECTOR_GET_NX(interface)	(((struct starpu_vector_interface *)(interface))->nx)
+#define STARPU_VECTOR_GET_ELEMSIZE(interface)	(((struct starpu_vector_interface *)(interface))->elemsize)
 
 /* variable interface for a single data (not a vector, a matrix, a list, ...) */
-typedef struct starpu_variable_interface_s {
+struct starpu_variable_interface
+{
 	uintptr_t ptr;
 	size_t elemsize;
-} starpu_variable_interface_t;
+	/* No dev_handle, since it can not be filtered, offset will always be zero */
+};
 
-void starpu_variable_data_register(starpu_data_handle *handle, uint32_t home_node,
-                        uintptr_t ptr, size_t elemsize);
-size_t starpu_variable_get_elemsize(starpu_data_handle handle);
-uintptr_t starpu_variable_get_local_ptr(starpu_data_handle handle);
+void starpu_variable_data_register(starpu_data_handle_t *handle, uint32_t home_node, uintptr_t ptr, size_t size);
+size_t starpu_variable_get_elemsize(starpu_data_handle_t handle);
+uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 
 /* helper methods */
-#define STARPU_VARIABLE_GET_PTR(interface)	(((starpu_variable_interface_t *)(interface))->ptr)
-#define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((starpu_variable_interface_t *)(interface))->elemsize)
+#define STARPU_VARIABLE_GET_PTR(interface)	(((struct starpu_variable_interface *)(interface))->ptr)
+#define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((struct starpu_variable_interface *)(interface))->elemsize)
 
 /* void interface. There is no data really associated to that interface, but it
  * may be used as a synchronization mechanism. It also permits to express an
  * abstract piece of data that is managed by the application internally: this
  * makes it possible to forbid the concurrent execution of different tasks
  * accessing the same "void" data in read-write concurrently. */
-void starpu_void_data_register(starpu_data_handle *handleptr);
+void starpu_void_data_register(starpu_data_handle_t *handle);
 
 /* CSR interface for sparse matrices (compressed sparse row representation) */
-typedef struct starpu_csr_interface_s {
+struct starpu_csr_interface
+{
 	uint32_t nnz; /* number of non-zero entries */
 	uint32_t nrow; /* number of rows */
 	uintptr_t nzval; /* non-zero values */
-	uint32_t *colind; /* position of non-zero entried on the row */
+	uint32_t *colind; /* position of non-zero entries on the row */
 	uint32_t *rowptr; /* index (in nzval) of the first entry of the row */
 
         /* k for k-based indexing (0 or 1 usually) */
@@ -258,29 +286,29 @@ typedef struct starpu_csr_interface_s {
         uint32_t firstentry;
 
 	size_t elemsize;
-} starpu_csr_interface_t;
-
-void starpu_csr_data_register(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
-		uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize);
-uint32_t starpu_csr_get_nnz(starpu_data_handle handle);
-uint32_t starpu_csr_get_nrow(starpu_data_handle handle);
-uint32_t starpu_csr_get_firstentry(starpu_data_handle handle);
-uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle);
-uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle);
-uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle);
-size_t starpu_csr_get_elemsize(starpu_data_handle handle);
-
-#define STARPU_CSR_GET_NNZ(interface)	(((starpu_csr_interface_t *)(interface))->nnz)
-#define STARPU_CSR_GET_NROW(interface)	(((starpu_csr_interface_t *)(interface))->nrow)
-#define STARPU_CSR_GET_NZVAL(interface)	(((starpu_csr_interface_t *)(interface))->nzval)
-#define STARPU_CSR_GET_COLIND(interface)	(((starpu_csr_interface_t *)(interface))->colind)
-#define STARPU_CSR_GET_ROWPTR(interface)	(((starpu_csr_interface_t *)(interface))->rowptr)
-#define STARPU_CSR_GET_FIRSTENTRY(interface)	(((starpu_csr_interface_t *)(interface))->firstentry)
-#define STARPU_CSR_GET_ELEMSIZE(interface)	(((starpu_csr_interface_t *)(interface))->elemsize)
+};
+
+void starpu_csr_data_register(starpu_data_handle_t *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize);
+uint32_t starpu_csr_get_nnz(starpu_data_handle_t handle);
+uint32_t starpu_csr_get_nrow(starpu_data_handle_t handle);
+uint32_t starpu_csr_get_firstentry(starpu_data_handle_t handle);
+uintptr_t starpu_csr_get_local_nzval(starpu_data_handle_t handle);
+uint32_t *starpu_csr_get_local_colind(starpu_data_handle_t handle);
+uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle_t handle);
+size_t starpu_csr_get_elemsize(starpu_data_handle_t handle);
+
+#define STARPU_CSR_GET_NNZ(interface)	(((struct starpu_csr_interface *)(interface))->nnz)
+#define STARPU_CSR_GET_NROW(interface)	(((struct starpu_csr_interface *)(interface))->nrow)
+#define STARPU_CSR_GET_NZVAL(interface)	(((struct starpu_csr_interface *)(interface))->nzval)
+#define STARPU_CSR_GET_COLIND(interface)	(((struct starpu_csr_interface *)(interface))->colind)
+#define STARPU_CSR_GET_ROWPTR(interface)	(((struct starpu_csr_interface *)(interface))->rowptr)
+#define STARPU_CSR_GET_FIRSTENTRY(interface)	(((struct starpu_csr_interface *)(interface))->firstentry)
+#define STARPU_CSR_GET_ELEMSIZE(interface)	(((struct starpu_csr_interface *)(interface))->elemsize)
 
 /* BCSR interface for sparse matrices (blocked compressed sparse row
  * representation) */
-typedef struct starpu_bcsr_interface_s {
+struct starpu_bcsr_interface
+{
 	uint32_t nnz; /* number of non-zero BLOCKS */
 	uint32_t nrow; /* number of rows (in terms of BLOCKS) */
 
@@ -298,35 +326,74 @@ typedef struct starpu_bcsr_interface_s {
 	uint32_t c;
 
 	size_t elemsize;
-} starpu_bcsr_interface_t;
+};
 
-void starpu_bcsr_data_register(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
-		uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize);
+void starpu_bcsr_data_register(starpu_data_handle_t *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize);
+
+#define STARPU_BCSR_GET_NNZ(interface)        (((struct starpu_bcsr_interface *)(interface))->nnz)
+#define STARPU_BCSR_GET_NZVAL(interface)      (((struct starpu_bcsr_interface *)(interface))->nzval)
+#define STARPU_BCSR_GET_COLIND(interface)     (((struct starpu_bcsr_interface *)(interface))->colind)
+#define STARPU_BCSR_GET_ROWPTR(interface)     (((struct starpu_bcsr_interface *)(interface))->rowptr)
+uint32_t starpu_bcsr_get_nnz(starpu_data_handle_t handle);
+uint32_t starpu_bcsr_get_nrow(starpu_data_handle_t handle);
+uint32_t starpu_bcsr_get_firstentry(starpu_data_handle_t handle);
+uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle);
+uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle);
+uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle);
+uint32_t starpu_bcsr_get_r(starpu_data_handle_t handle);
+uint32_t starpu_bcsr_get_c(starpu_data_handle_t handle);
+size_t starpu_bcsr_get_elemsize(starpu_data_handle_t handle);
+
+/*
+ * Multiformat interface
+ */
+struct starpu_multiformat_data_interface_ops
+{
+	size_t cpu_elemsize;
+#ifdef STARPU_USE_OPENCL
+	size_t opencl_elemsize;
+	struct starpu_codelet *cpu_to_opencl_cl;
+	struct starpu_codelet *opencl_to_cpu_cl;
+#endif
+#ifdef STARPU_USE_CUDA
+	size_t cuda_elemsize;
+	struct starpu_codelet *cpu_to_cuda_cl;
+	struct starpu_codelet *cuda_to_cpu_cl;
+#endif
+};
 
+struct starpu_multiformat_interface
+{
+	void *cpu_ptr;
+#ifdef STARPU_USE_CUDA
+	void *cuda_ptr;
+#endif
+#ifdef STARPU_USE_OPENCL
+	void *opencl_ptr;
+#endif
+	uint32_t nx;
+	struct starpu_multiformat_data_interface_ops *ops;
+};
 
-uint32_t starpu_bcsr_get_nnz(starpu_data_handle);
-uint32_t starpu_bcsr_get_nrow(starpu_data_handle);
-uint32_t starpu_bcsr_get_firstentry(starpu_data_handle);
-uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle);
-uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle);
-uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle);
-uint32_t starpu_bcsr_get_r(starpu_data_handle);
-uint32_t starpu_bcsr_get_c(starpu_data_handle);
-size_t starpu_bcsr_get_elemsize(starpu_data_handle);
+void starpu_multiformat_data_register(starpu_data_handle_t *handle, uint32_t home_node, void *ptr, uint32_t nobjects, struct starpu_multiformat_data_interface_ops *format_ops);
 
-#define STARPU_MATRIX_INTERFACE_ID	0
-#define STARPU_BLOCK_INTERFACE_ID	1
-#define STARPU_VECTOR_INTERFACE_ID	2
-#define STARPU_CSR_INTERFACE_ID		3
-#define STARPU_BCSR_INTERFACE_ID	4
-#define STARPU_VARIABLE_INTERFACE_ID	5
-#define STARPU_VOID_INTERFACE_ID	6
-#define STARPU_NINTERFACES_ID		7 /* number of data interfaces */
+#define STARPU_MULTIFORMAT_GET_CPU_PTR(interface)  (((struct starpu_multiformat_interface *)(interface))->cpu_ptr)
 
-unsigned starpu_get_handle_interface_id(starpu_data_handle);
+#ifdef STARPU_USE_CUDA
+#define STARPU_MULTIFORMAT_GET_CUDA_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->cuda_ptr)
+#endif
+
+#ifdef STARPU_USE_OPENCL
+#define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->opencl_ptr)
+#endif
+
+#define STARPU_MULTIFORMAT_GET_NX(interface)  (((struct starpu_multiformat_interface *)(interface))->nx)
+
+enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle);
 
 /* Lookup a ram pointer into a StarPU handle */
-extern starpu_data_handle starpu_data_lookup(const void *ptr);
+extern starpu_data_handle_t starpu_data_lookup(const void *ptr);
+
 
 #ifdef __cplusplus
 }

+ 61 - 0
include/starpu_deprecated_api.h

@@ -0,0 +1,61 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_DEPRECATED_API_H__
+#define __STARPU_DEPRECATED_API_H__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#warning Your application is still using deprecated types. Please update to use the latest API, e.g. using tools/dev/rename.sh
+
+typedef starpu_data_handle_t starpu_data_handle;
+typedef struct starpu_block_interface starpu_block_interface_t;
+typedef struct starpu_matrix_interface starpu_matrix_interface_t;
+typedef struct starpu_vector_interface starpu_vector_interface_t;
+typedef struct starpu_variable_interface starpu_variable_interface_t;
+typedef struct starpu_csr_interface starpu_csr_interface_t;
+typedef struct starpu_bcsr_interface starpu_bcsr_interface_t;
+typedef struct starpu_multiformat_interface starpu_multiformat_interface_t;
+#define starpu_machine_topology_s starpu_machine_topology
+#define starpu_htbl32_node_s starpu_htbl32_node
+#define starpu_history_list_t starpu_history_list
+#define starpu_buffer_descr_t starpu_buffer_descr
+#define starpu_history_list_t starpu_history_list
+#define starpu_regression_model_t starpu_regression_model
+#define starpu_per_arch_perfmodel_t starpu_per_arch_perfmodel
+#define starpu_perfmodel_t starpu_perfmodel
+#define starpu_sched_policy_s starpu_sched_policy
+#define starpu_data_interface_ops_t starpu_data_interface_ops
+
+typedef struct starpu_buffer_descr starpu_buffer_descr;
+typedef struct starpu_codelet starpu_codelet;
+typedef enum starpu_access_mode starpu_access_mode;
+
+#define starpu_print_bus_bandwidth     starpu_bus_print_bandwidth
+#define starpu_get_handle_interface_id starpu_handle_get_interface_id
+#define starpu_get_current_task        starpu_task_get_current
+#define starpu_unpack_cl_args          starpu_codelet_unpack_args
+#define starpu_pack_cl_args            starpu_codelet_pack_args
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_DEPRECATED_API_H__ */

+ 3 - 3
include/starpu_expert.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,10 +18,10 @@
 #define __STARPU_EXPERT_H__
 
 #include <starpu.h>
-#include <starpu_config.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 void starpu_wake_all_blocked_workers(void);

+ 69 - 0
include/starpu_fxt.h

@@ -0,0 +1,69 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_FXT_H__
+#define __STARPU_FXT_H__
+
+#include <starpu_perfmodel.h>
+
+#define STARPU_FXT_MAX_FILES	64
+
+struct starpu_fxt_codelet_event
+{
+	char symbol[256]; /* name of the codelet */
+	int workerid;
+	enum starpu_perf_archtype archtype;
+	uint32_t hash;
+	size_t size;
+	float time;
+};
+
+struct starpu_fxt_options
+{
+	unsigned per_task_colour;
+	unsigned no_counter;
+	unsigned no_bus;
+	unsigned ninputfiles;
+	char *filenames[STARPU_FXT_MAX_FILES];
+	char *out_paje_path;
+	char *distrib_time_path;
+	char *activity_path;
+	char *dag_path;
+
+	/* In case we are going to gather multiple traces (eg in the case of
+	 * MPI processes), we may need to prefix the name of the containers. */
+	char *file_prefix;
+	uint64_t file_offset;
+	int file_rank;
+
+	/*
+	 *	Output parameters
+	 */
+
+	char worker_names[STARPU_NMAXWORKERS][256]; 
+	enum starpu_perf_archtype worker_archtypes[STARPU_NMAXWORKERS];
+	int nworkers;
+
+	/* In case we want to dump the list of codelets to an external tool */
+	struct starpu_fxt_codelet_event **dumped_codelets;
+	long dumped_codelets_count;
+};
+
+void starpu_fxt_options_init(struct starpu_fxt_options *options);
+void starpu_fxt_generate_trace(struct starpu_fxt_options *options);
+
+#endif /* __STARPU_FXT_H__ */

+ 39 - 0
include/starpu_hash.h

@@ -0,0 +1,39 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_HASH_H__
+#define __STARPU_HASH_H__
+
+#include <stdint.h>
+#include <stddef.h>
+
+/* Compute the CRC of a byte buffer seeded by the inputcrc "current state".
+ * The return value should be considered as the new "current state" for future
+ * CRC computation. */
+uint32_t starpu_crc32_be_n(void *input, size_t n, uint32_t inputcrc);
+
+/* Compute the CRC of a 32bit number seeded by the inputcrc "current state".
+ * The return value should be considered as the new "current state" for future
+ * CRC computation. */
+uint32_t starpu_crc32_be(uint32_t input, uint32_t inputcrc);
+
+/* Compute the CRC of a string seeded by the inputcrc "current state".  The
+ * return value should be considered as the new "current state" for future CRC
+ * computation. */
+uint32_t starpu_crc32_string(char *str, uint32_t inputcrc);
+
+#endif /* __STARPU_HASH_H__ */

+ 48 - 13
include/starpu_opencl.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -27,38 +27,39 @@
 #include <starpu_config.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-void starpu_opencl_display_error(const char *func, const char *msg, cl_int status);
+void starpu_opencl_display_error(const char *func, const char *file, int line, const char *msg, cl_int status);
 #define STARPU_OPENCL_DISPLAY_ERROR(status) \
-	starpu_opencl_display_error(__starpu_func__, NULL, status)
+	starpu_opencl_display_error(__starpu_func__, __FILE__, __LINE__, NULL, status)
 
-static inline void starpu_opencl_report_error(const char *func, const char *msg, cl_int status)
+static inline void starpu_opencl_report_error(const char *func, const char *file, int line, const char *msg, cl_int status)
 {
-        starpu_opencl_display_error(func, msg, status);
+	starpu_opencl_display_error(func, file, line, msg, status);
         assert(0);
 }
 #define STARPU_OPENCL_REPORT_ERROR(status)			\
-	starpu_opencl_display_error(__starpu_func__, NULL, status)
+	starpu_opencl_report_error(__starpu_func__, __FILE__, __LINE__, NULL, status)
 
 #define STARPU_OPENCL_REPORT_ERROR_WITH_MSG(msg, status)			\
-	starpu_opencl_display_error(__starpu_func__, msg, status)
+	starpu_opencl_report_error(__starpu_func__, __FILE__, __LINE__, msg, status)
 
-struct starpu_opencl_program {
+struct starpu_opencl_program
+{
         cl_program programs[STARPU_MAXOPENCLDEVS];
 };
 
+size_t starpu_opencl_get_global_mem_size(int devid);
 void starpu_opencl_get_context(int devid, cl_context *context);
 void starpu_opencl_get_device(int devid, cl_device_id *device);
 void starpu_opencl_get_queue(int devid, cl_command_queue *queue);
 void starpu_opencl_get_current_context(cl_context *context);
 void starpu_opencl_get_current_queue(cl_command_queue *queue);
 
-int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs,
-					const char* build_options);
-int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs,
-					  const char* build_options);
+int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options);
+int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char* build_options);
 int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);
 
 int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs, const char *kernel_name, int devid);
@@ -66,6 +67,40 @@ int starpu_opencl_release_kernel(cl_kernel kernel);
 
 int starpu_opencl_collect_stats(cl_event event);
 
+/*
+ * Sets the arguments of an OpenCL kernel.
+ * Arguments to pass to the kernel should be given as follows :
+ * 
+ * 	size of the argument,  pointer to the argument
+ *
+ * 0 must be passed to this function after the kernel arguments.
+ *
+ * In case of failure, returns the id of the argument that could not be set,
+ * and sets "error" to the error returned. Otherwise, returns the number of 
+ * arguments that were set.
+ *
+ * Example :
+ * int n;
+ * cl_int err;
+ * cl_kernel kernel;
+ * n = starpu_opencl_set_kernel_args(&err, &kernel,
+ *				     sizeof(foo), &foo,
+ *                                   sizeof(bar), &bar,
+ *                                   0);
+ * if (n != 2)
+ * 	fprintf(stderr, "Error : %d\n", err);
+ */
+int starpu_opencl_set_kernel_args(cl_int *err, cl_kernel *kernel, ...);
+
+cl_int starpu_opencl_allocate_memory(cl_mem *addr, size_t size, cl_mem_flags flags);
+
+cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event);
+
+cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event);
+
+cl_int starpu_opencl_copy_ram_to_opencl_async_sync(void *ptr, unsigned src_node, cl_mem buffer, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
+
+cl_int starpu_opencl_copy_opencl_to_ram_async_sync(cl_mem buffer, unsigned src_node, void *ptr, unsigned dst_node, size_t size, size_t offset, cl_event *event, int *ret);
 
 #ifdef __cplusplus
 }

+ 108 - 30
include/starpu_perfmodel.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,30 +19,34 @@
 #ifndef __STARPU_PERFMODEL_H__
 #define __STARPU_PERFMODEL_H__
 
-#include <starpu_config.h>
-#include <stdio.h>
 #include <starpu.h>
-#include <starpu_task.h>
+#include <stdio.h>
+
+#include <starpu_util.h>
 
 #if ! defined(_MSC_VER)
 #  include <pthread.h>
 #endif
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-struct starpu_htbl32_node_s;
-struct starpu_history_list_t;
-struct starpu_buffer_descr_t;
+struct starpu_task;
+
+struct starpu_htbl32_node;
+struct starpu_history_list;
+struct starpu_buffer_descr;
 
-/* 
+/*
    it is possible that we have multiple versions of the same kind of workers,
    for instance multiple GPUs or even different CPUs within the same machine
    so we do not use the archtype enum type directly for performance models
 */
 
-enum starpu_perf_archtype {
+enum starpu_perf_archtype
+{
 	STARPU_CPU_DEFAULT = 0,
 	/* CPU combined workers between 0 and STARPU_MAXCPUS-1 */
 	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
@@ -51,9 +55,73 @@ enum starpu_perf_archtype {
 	STARPU_GORDON_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS
 };
 
+#ifdef __STDC_VERSION__
+#  if __STDC_VERSION__ > 199901L || STARPU_GNUC_PREREQ(4, 6)
+
+/* Make sure the following assertions hold, since StarPU relies on them.  */
+
+_Static_assert(STARPU_CPU_DEFAULT == 0,
+	       "invalid STARPU_CPU_DEFAULT value");
+_Static_assert(STARPU_CUDA_DEFAULT > STARPU_CPU_DEFAULT,
+	       "invalid STARPU_{CPU,CUDA}_DEFAULT values");
+_Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
+	       "invalid STARPU_{CUDA,OPENCL}_DEFAULT values");
+
+#  endif
+#endif
+
 #define STARPU_NARCH_VARIATIONS	(STARPU_GORDON_DEFAULT+1)
 
-struct starpu_regression_model_t {
+struct starpu_history_entry
+{
+	//double measured;
+
+	/* mean_n = 1/n sum */
+	double mean;
+
+	/* n dev_n = sum2 - 1/n (sum)^2 */
+	double deviation;
+
+	/* sum of samples */
+	double sum;
+
+	/* sum of samples^2 */
+	double sum2;
+
+//	/* sum of ln(measured) */
+//	double sumlny;
+//
+//	/* sum of ln(size) */
+//	double sumlnx;
+//	double sumlnx2;
+//
+//	/* sum of ln(size) ln(measured) */
+//	double sumlnxlny;
+//
+	unsigned nsample;
+
+	uint32_t footprint;
+#ifdef STARPU_HAVE_WINDOWS
+	unsigned size; /* in bytes */
+#else
+	size_t size; /* in bytes */
+#endif
+};
+
+struct starpu_history_list
+{
+	struct starpu_history_list *next;
+	struct starpu_history_entry *entry;
+};
+
+struct starpu_model_list
+{
+	struct starpu_model_list *next;
+	struct starpu_perfmodel *model;
+};
+
+struct starpu_regression_model
+{
 	/* sum of ln(measured) */
 	double sumlny;
 
@@ -61,6 +129,10 @@ struct starpu_regression_model_t {
 	double sumlnx;
 	double sumlnx2;
 
+	/* minimum/maximum(size) */
+	unsigned long minx;
+	unsigned long maxx;
+
 	/* sum of ln(size) ln(measured) */
 	double sumlnxlny;
 
@@ -76,36 +148,43 @@ struct starpu_regression_model_t {
 	unsigned nsample;
 };
 
-struct starpu_per_arch_perfmodel_t {
-	double (*cost_model)(struct starpu_buffer_descr_t *t); /* returns expected duration in µs */
+struct starpu_per_arch_perfmodel
+{
+	double (*cost_model)(struct starpu_buffer_descr *t) STARPU_DEPRECATED; /* returns expected duration in µs */
+	double (*cost_function)(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl); /* returns expected duration in µs */
+	size_t (*size_base)(struct starpu_task *, enum starpu_perf_archtype arch, unsigned nimpl);
 
 	/* internal variables */
-	double alpha;
-	struct starpu_htbl32_node_s *history;
-	struct starpu_history_list_t *list;
-	struct starpu_regression_model_t regression;
+	struct starpu_htbl32_node *history;
+	struct starpu_history_list *list;
+	struct starpu_regression_model regression;
 #ifdef STARPU_MODEL_DEBUG
-	FILE *debug_file;
+	char debug_path[256];
 #endif
 };
 
-typedef enum {
+enum starpu_perfmodel_type
+{
 	STARPU_PER_ARCH,	/* Application-provided per-arch cost model function */
 	STARPU_COMMON,		/* Application-provided common cost model function, with per-arch factor */
 	STARPU_HISTORY_BASED,	/* Automatic history-based cost model */
 	STARPU_REGRESSION_BASED,	/* Automatic linear regression-based cost model  (alpha * size ^ beta) */
 	STARPU_NL_REGRESSION_BASED	/* Automatic non-linear regression-based cost model (a * size ^ b + c) */
-} starpu_perfmodel_type;
+};
 
-struct starpu_perfmodel_t {
+struct starpu_perfmodel
+{
 	/* which model is used for that task ? */
-	starpu_perfmodel_type type;
+	enum starpu_perfmodel_type type;
 
 	/* single cost model (STARPU_COMMON), returns expected duration in µs */
-	double (*cost_model)(struct starpu_buffer_descr_t *);
+	double (*cost_model)(struct starpu_buffer_descr *) STARPU_DEPRECATED;
+	double (*cost_function)(struct starpu_task *, unsigned nimpl);
+
+	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 
 	/* per-architecture model */
-	struct starpu_per_arch_perfmodel_t per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
+	struct starpu_per_arch_perfmodel per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
 
 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
 	const char *symbol;
@@ -125,14 +204,13 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
 
 /* This function is intended to be used by external tools that should read the
  * performance model files */
-int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel_t *model);
-void starpu_perfmodel_debugfilepath(struct starpu_perfmodel_t *model,
-		enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
-void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch,	char *archname, size_t maxlen, unsigned nimpl);
-int starpu_list_models(void);
+int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel *model);
+void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
+void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
+int starpu_list_models(FILE *output);
 
 void starpu_force_bus_sampling(void);
-void starpu_print_bus_bandwidth(FILE *f);
+void starpu_bus_print_bandwidth(FILE *f);
 
 #ifdef __cplusplus
 }

+ 11 - 8
include/starpu_profiling.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,19 +18,20 @@
 #ifndef __STARPU_PROFILING_H__
 #define __STARPU_PROFILING_H__
 
+#include <starpu.h>
 #include <errno.h>
 #include <sys/time.h>
 
-#include <starpu.h>
-
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 #define STARPU_PROFILING_DISABLE	0
 #define STARPU_PROFILING_ENABLE		1
 
-struct starpu_task_profiling_info {
+struct starpu_task_profiling_info
+{
 	/* Task submission */
 	struct timespec submit_time;
 
@@ -65,7 +66,8 @@ struct starpu_task_profiling_info {
 };
 
 /* The timing is provided since the previous call to starpu_worker_get_profiling_info */
-struct starpu_worker_profiling_info {
+struct starpu_worker_profiling_info
+{
 	struct timespec start_time;
 	struct timespec total_time;
 	struct timespec executing_time;
@@ -77,7 +79,8 @@ struct starpu_worker_profiling_info {
 	double power_consumed;
 };
 
-struct starpu_bus_profiling_info {
+struct starpu_bus_profiling_info
+{
 	struct timespec start_time;
 	struct timespec total_time;
 	int long long transferred_bytes;
@@ -89,7 +92,7 @@ void starpu_set_profiling_id(int new_id);
 
 /* This function sets the profiling status:
  * - enable with STARPU_PROFILING_ENABLE
- * - disable with STARPU_PROFILING_DISABLE 
+ * - disable with STARPU_PROFILING_DISABLE
  * Negative return values indicate an error, otherwise the previous status is
  * returned. Calling this function resets the profiling measurements. */
 int starpu_profiling_status_set(int status);

+ 28 - 15
include/starpu_scheduler.h

@@ -20,7 +20,6 @@
 #define __STARPU_SCHEDULER_H__
 
 #include <starpu.h>
-#include <starpu_config.h>
 
 #if ! defined(_MSC_VER)
 #  include <pthread.h>
@@ -30,9 +29,15 @@
 #include <hwloc.h>
 #endif
 
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
 struct starpu_task;
 
-struct starpu_machine_topology_s {
+struct starpu_machine_topology
+{
 	unsigned nworkers;
 
 	unsigned ncombinedworkers;
@@ -56,7 +61,7 @@ struct starpu_machine_topology_s {
 
 	/* Where to bind workers ? */
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
-	
+
 	/* Which GPU(s) do we use for CUDA ? */
 	unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS];
 
@@ -67,7 +72,8 @@ struct starpu_machine_topology_s {
 /* This structure contains all the methods that implement a scheduling policy.
  * An application may specify which scheduling strategy in the "sched_policy"
  * field of the starpu_conf structure passed to the starpu_init function. */
-struct starpu_sched_policy_s {
+struct starpu_sched_policy
+{
 	/* Initialize the scheduling policy. */
 	void (*init_sched)(unsigned sched_ctx_id);
 
@@ -75,7 +81,8 @@ struct starpu_sched_policy_s {
 	void (*deinit_sched)(unsigned sched_ctx_id);
 
 	/* Insert a task into the scheduler. */
-        int (*push_task)(struct starpu_task *);
+	int (*push_task)(struct starpu_task *);
+
 	/* Notify the scheduler that a task was directly pushed to the worker
 	 * without going through the scheduler. This method is called when a
 	 * task is explicitly assigned to a worker. This method therefore
@@ -83,18 +90,20 @@ struct starpu_sched_policy_s {
 	 * when StarPU bypasses the scheduling strategy. */
 	void (*push_task_notify)(struct starpu_task *, int workerid);
 
-
 	/* Get a task from the scheduler. The mutex associated to the worker is
 	 * already taken when this method is called. */
-	struct starpu_task *(*pop_task)();
+	struct starpu_task *(*pop_task)(void);
 
 	 /* Remove all available tasks from the scheduler (tasks are chained by
 	  * the means of the prev and next fields of the starpu_task
 	  * structure). The mutex associated to the worker is already taken
 	  * when this method is called. */
-	struct starpu_task *(*pop_every_task)();
+	struct starpu_task *(*pop_every_task)(void);
+
+	/* This method is called every time a task is starting. (optional) */
+	void (*pre_exec_hook)(struct starpu_task *);
 
-	/* This method is called every time a task has been executed. (optionnal) */
+	/* This method is called every time a task has been executed. (optional) */
 	void (*post_exec_hook)(struct starpu_task *);
 
 	/* Initialize the scheduling policy for added workers. */
@@ -176,7 +185,7 @@ unsigned starpu_get_nworkers_of_sched_ctx(unsigned sched_ctx);
 unsigned starpu_get_nshared_workers(unsigned sched_ctx_id, unsigned sched_ctx_id2);
 
 /* Check if the worker specified by workerid can execute the codelet. */
-int starpu_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
+int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 /* The scheduling policy may put tasks directly into a worker's local queue so
  * that it is not always necessary to create its own queue when the local queue
@@ -209,12 +218,10 @@ void starpu_sched_set_max_priority(int max_prio);
 
 /* Register a new combined worker and get its identifier */
 int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[]);
-/* Initialize combined workers */
-void _starpu_sched_find_worker_combinations(struct starpu_machine_topology_s *topology);
 /* Get the description of a combined worker */
 int starpu_combined_worker_get_description(int workerid, int *worker_size, int **combined_workerid);
-/* Variant of starpu_worker_may_execute_task compatible with combined workers */
-int starpu_combined_worker_may_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
+/* Variant of starpu_worker_can_execute_task compatible with combined workers */
+int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 
 /*
  *	Data prefetching
@@ -238,9 +245,15 @@ double starpu_worker_get_relative_speedup(enum starpu_perf_archtype perf_archtyp
 /* Returns expected data transfer time in µs */
 double starpu_task_expected_data_transfer_time(uint32_t memory_node, struct starpu_task *task);
 /* Predict the transfer time (in µs) to move a handle to a memory node */
-double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned memory_node, starpu_access_mode mode);
+double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned memory_node, enum starpu_access_mode mode);
 /* Returns expected power consumption in J */
 double starpu_task_expected_power(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
+/* Returns expected conversion time in ms (multiformat interface only) */
+double starpu_task_expected_conversion_time(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
+
+#ifdef __cplusplus
+}
+#endif
 
 /* Waits until all the tasks of a worker, already submitted, have been executed */
 int starpu_wait_for_all_tasks_of_worker(int workerid);

+ 80 - 51
include/starpu_task.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  INRIA
  *
@@ -20,71 +20,77 @@
 #ifndef __STARPU_TASK_H__
 #define __STARPU_TASK_H__
 
-#include <errno.h>
 #include <starpu.h>
-#include <starpu_config.h>
+#include <starpu_data.h>
+#include <starpu_task_bundle.h>
+#include <errno.h>
 
 #if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
 # include <cuda.h>
 #endif
 
-#include <starpu_data.h>
+#ifdef __cplusplus
+extern "C"
+{
+#endif
 
 #define STARPU_CPU	((1ULL)<<1)
 #define STARPU_CUDA	((1ULL)<<3)
-#define STARPU_SPU	((1ULL)<<4)
-#define STARPU_GORDON	((1ULL)<<5)
+#define	STARPU_SPU	((1ULL)<<4)
+#define	STARPU_GORDON	((1ULL)<<5)
 #define STARPU_OPENCL	((1ULL)<<6)
 
 /* Codelet types */
-#define STARPU_SEQ		0
-#define STARPU_SPMD		1
-#define STARPU_FORKJOIN		2
+enum starpu_codelet_type
+{
+	STARPU_SEQ,
+	STARPU_SPMD,
+	STARPU_FORKJOIN
+};
 
 /* task status */
-#define STARPU_TASK_INVALID	0
-#define STARPU_TASK_BLOCKED	1
-#define STARPU_TASK_READY	2
-#define STARPU_TASK_RUNNING	3
-#define STARPU_TASK_FINISHED	4
-
-#define STARPU_TASK_BLOCKED_ON_TAG	5
-#define STARPU_TASK_BLOCKED_ON_TASK	6
-#define STARPU_TASK_BLOCKED_ON_DATA	7
-
-#ifdef __cplusplus
-extern "C" {
-#endif
+enum starpu_task_status
+{
+	STARPU_TASK_INVALID,
+	STARPU_TASK_BLOCKED,
+	STARPU_TASK_READY,
+	STARPU_TASK_RUNNING,
+	STARPU_TASK_FINISHED,
+	STARPU_TASK_BLOCKED_ON_TAG,
+	STARPU_TASK_BLOCKED_ON_TASK,
+	STARPU_TASK_BLOCKED_ON_DATA
+};
 
 typedef uint64_t starpu_tag_t;
 
-
 typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
 typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
 typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
 typedef uint8_t starpu_gordon_func_t; /* Cell SPU */
 
-#define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    (starpu_cpu_func_t) -1
-#define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   (starpu_cuda_func_t) -1
-#define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS (starpu_opencl_func_t) -1
+#define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
+#define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
+#define STARPU_MULTIPLE_OPENCL_IMPLEMENTATIONS ((starpu_opencl_func_t) -1)
 #define STARPU_MULTIPLE_GORDON_IMPLEMENTATIONS 255
 
-
 /*
- * A codelet describes the various function 
+ * A codelet describes the various function
  * that may be called from a worker
  */
-typedef struct starpu_codelet_t {
+struct starpu_task;
+struct starpu_codelet
+{
 	/* where can it be performed ? */
 	uint32_t where;
-	unsigned type;
+	int (*can_execute)(unsigned workerid, struct starpu_task *task, unsigned nimpl);
+	enum starpu_codelet_type type;
 	int max_parallelism;
 
 	/* the different implementations of the codelet */
-	void (*cuda_func)(void **, void *);
-	void (*cpu_func)(void **, void *);
-	void (*opencl_func)(void **, void *);
-	uint8_t gordon_func;
+	starpu_cuda_func_t cuda_func STARPU_DEPRECATED;
+	starpu_cpu_func_t cpu_func STARPU_DEPRECATED;
+	starpu_opencl_func_t opencl_func STARPU_DEPRECATED;
+	uint8_t gordon_func STARPU_DEPRECATED;
 
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
@@ -93,31 +99,41 @@ typedef struct starpu_codelet_t {
 
 	/* how many buffers do the codelet takes as argument ? */
 	unsigned nbuffers;
+	/* which are the access modes for these buffers */
+	enum starpu_access_mode modes[STARPU_NMAXBUFS];
 
 	/* performance model of the codelet */
-	struct starpu_perfmodel_t *model;
+	struct starpu_perfmodel *model;
 	/* consumption model of the codelet.
 	 * In the case of parallel codelets, accounts for all units. */
-	struct starpu_perfmodel_t *power_model;
+	struct starpu_perfmodel *power_model;
 
 	/* statistics collected at runtime: this is filled by StarPU and should
 	 * not be accessed directly (use the starpu_display_codelet_stats
 	 * function instead for instance). */
 	unsigned long per_worker_stats[STARPU_NMAXWORKERS];
-} starpu_codelet;
 
-struct starpu_task {
-	struct starpu_codelet_t *cl;
+	const char *name;
+};
+
+#ifdef STARPU_GCC_PLUGIN
+typedef struct starpu_codelet starpu_codelet_gcc;
+#endif /* STARPU_GCC_PLUGIN */
+
+struct starpu_task
+{
+	struct starpu_codelet *cl;
 
 	/* arguments managed by the DSM */
-	struct starpu_buffer_descr_t buffers[STARPU_NMAXBUFS];
+	struct starpu_buffer_descr buffers[STARPU_NMAXBUFS] STARPU_DEPRECATED;
+	starpu_data_handle_t handles[STARPU_NMAXBUFS];
 	void *interfaces[STARPU_NMAXBUFS];
 
 	/* arguments not managed by the DSM are given as a buffer */
 	void *cl_arg;
 	/* in case the argument buffer has to be uploaded explicitely */
 	size_t cl_arg_size;
-	
+
 	/* when the task is done, callback_func(callback_arg) is called */
 	void (*callback_func)(void *);
 	void *callback_arg;
@@ -127,7 +143,7 @@ struct starpu_task {
 
 	/* options for the task execution */
 	unsigned synchronous; /* if set, a call to push is blocking */
-	int priority; /* STARPU_MAX_PRIO = most important 
+	int priority; /* STARPU_MAX_PRIO = most important
         		: STARPU_MIN_PRIO = least important */
 
 	/* in case the task has to be executed on a specific worker */
@@ -135,7 +151,7 @@ struct starpu_task {
 	unsigned workerid;
 
 	/* Bundle including the task */
-	struct starpu_task_bundle *bundle;
+	starpu_task_bundle_t bundle;
 
 	/* If this flag is set, it is not possible to synchronize with the task
 	 * by the means of starpu_task_wait later on. Internal data structures
@@ -154,10 +170,10 @@ struct starpu_task {
 
 	/* If this flag is set, the task will be re-submitted to StarPU once it
 	 * has been executed. This flag must not be set if the destroy flag is
-	 * set too. */ 
+	 * set too. */
 	int regenerate;
 
-	unsigned status;
+	enum starpu_task_status status;
 
 	/* This gets filled when profiling is enabled by using
 	 * starpu_profiling_status_set */
@@ -167,14 +183,21 @@ struct starpu_task {
 	 * scheduling strategy uses performance models. */
 	double predicted;
 
+	/* Predicted data transfer duration for the task in µs. This field is
+	 * only valid if the scheduling strategy uses performance models. */
+	double predicted_transfer;
+
 	/* This field are provided for the convenience of the scheduler. */
 	struct starpu_task *prev;
 	struct starpu_task *next;
 
+	unsigned int mf_skip;
+
 	/* this is private to StarPU, do not modify. If the task is allocated
 	 * by hand (without starpu_task_create), this field should be set to
 	 * NULL. */
 	void *starpu_private;
+	int magic;
 
 	/* Scheduling context */
 	unsigned sched_ctx;
@@ -209,11 +232,13 @@ struct starpu_task {
 	.status = STARPU_TASK_INVALID,			\
 	.profiling_info = NULL,				\
 	.predicted = -1.0,				\
+	.predicted_transfer = -1.0,			\
 	.starpu_private = NULL,				\
+	.magic = 42,                 			\
 	.sched_ctx = 0,					\
 	.control_task = 0,				\
-		.hypervisor_tag = 0,			\
-		.flops = 0.0			\
+	.hypervisor_tag = 0,			\
+	.flops = 0.0			\
 };
 
 /*
@@ -275,9 +300,8 @@ struct starpu_task *starpu_task_create(void);
  * structure (default behaviour). Calling this function on a statically
  * allocated task results in an undefined behaviour. */
 void starpu_task_destroy(struct starpu_task *task);
-
 int starpu_task_submit(struct starpu_task *task);
-	
+
 /* This function blocks until the task was executed. It is not possible to
  * synchronize with a task more than once. It is not possible to wait
  * synchronous or detached tasks.
@@ -289,16 +313,21 @@ int starpu_task_wait(struct starpu_task *task);
  * been executed. */
 int starpu_task_wait_for_all(void);
 
+/* This function waits until there is no more ready task. */
+int starpu_task_wait_for_no_ready(void);
+
 /* This function waits until all the tasks that were already submitted to a specific
  * context have been executed. */
 int starpu_wait_for_all_tasks_of_sched_ctx(unsigned sched_ctx_id);
 
-void starpu_display_codelet_stats(struct starpu_codelet_t *cl);
+void starpu_codelet_init(struct starpu_codelet *cl);
+
+void starpu_display_codelet_stats(struct starpu_codelet *cl);
 
 /* Return the task currently executed by the worker, or NULL if this is called
  * either from a thread that is not a task or simply because there is no task
  * being executed at the moment. */
-struct starpu_task *starpu_get_current_task(void);
+struct starpu_task *starpu_task_get_current(void);
 
 #ifdef __cplusplus
 }

+ 95 - 51
include/starpu_task_bundle.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
+ * Copyright (C) 2012  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -18,64 +19,107 @@
 #ifndef __STARPU_TASK_BUNDLE_H__
 #define __STARPU_TASK_BUNDLE_H__
 
-#include <starpu.h>
-#include <starpu_config.h>
-
-#if ! defined(_MSC_VER)
-#  include <pthread.h>
+#ifdef __cplusplus
+extern "C"
+{
 #endif
 
-struct starpu_task_bundle_entry {
-	struct starpu_task *task;
-	struct starpu_task_bundle_entry *next;
-};
-
-/* The task bundle structure describes a list of tasks that should be scheduled
- * together whenever possible. */
-struct starpu_task_bundle {
-	/* Mutex protecting the bundle */
-#if defined(_MSC_VER)
-	void *mutex;
-#else
-	pthread_mutex_t mutex;
-#endif
-	/* last worker previously assigned a task from the bundle (-1 if none) .*/
-	int previous_workerid;
-	/* list of tasks */
-	struct starpu_task_bundle_entry *list;
-	/* If this flag is set, the bundle structure is automatically free'd
-	 * when the bundle is deinitialized. */
-	int destroy;
-	/* Is the bundle closed ? */
-	int closed;
-	/* TODO retain bundle (do not schedule until closed) */
-};
+struct starpu_task;
 
-/* Initialize a task bundle */
-void starpu_task_bundle_init(struct starpu_task_bundle *bundle);
+/* starpu_task_bundle_t
+ * ==================
+ * Purpose
+ * =======
+ * Opaque structure describing a list of tasks that should be scheduled
+ * on the same worker whenever it's possible.
+ * It must be considered as a hint given to the scheduler as there is no guarantee that
+ * they will be executed on the same worker.
+ */
+typedef struct _starpu_task_bundle *starpu_task_bundle_t;
 
-/* Deinitialize a bundle. In case the destroy flag is set, the bundle structure
- * is freed too. */
-void starpu_task_bundle_deinit(struct starpu_task_bundle *bundle);
+/* starpu_task_bundle_create
+ * =========================
+ * Purpose
+ * =======
+ * Factory function creating a bundle. When the call returns, the
+ * needed memory is allocated and the bundle is ready to use.
+ *
+ * Arguments
+ * =========
+ * bundle		(output)
+ * 			Bundle to create and initialize.
+ */
+void starpu_task_bundle_create(starpu_task_bundle_t *bundle);
 
-/* Insert a task into a bundle. */
-int starpu_task_bundle_insert(struct starpu_task_bundle *bundle, struct starpu_task *task);
+/* starpu_task_bundle_insert
+ * =========================
+ * Purpose
+ * =======
+ * Insert a task in a bundle. Until the task is removed from the bundle
+ * its expected length and data transfer time will be considered along
+ * those of the other tasks of the bundle.
+ * This function mustn't be called if the bundle is already closed and/or
+ * the task is already submitted.
+ *
+ * Return value
+ * ============
+ * On success, it returns 0.
+ * There are two cases of error :
+ * 	- if the bundle is already closed it returns -EPERM
+ * 	- if the task was already submitted it returns -EINVAL.
+ *
+ * Arguments
+ * =========
+ * bundle		(input)
+ * 			Bundle where to insert the task.
+ *
+ * task			(input)
+ * 			Task to insert.
+ */
+int starpu_task_bundle_insert(starpu_task_bundle_t bundle, struct starpu_task *task);
 
-/* Remove a task from a bundle. This method must be called with bundle->mutex
- * hold. This function returns 0 if the task was found, -ENOENT if the element
- * was not found, 1 if the element is found and if the list was deinitialized
- * because it became empty. */
-int starpu_task_bundle_remove(struct starpu_task_bundle *bundle, struct starpu_task *task);
+/* starpu_task_bundle_remove
+ * =========================
+ * Purpose
+ * =======
+ * Remove the task passed as argument from the bundle.
+ * Of course the task must have been previously inserted in the bundle.
+ * This function mustn't be called if the bundle is already closed and/or
+ * the task is already submitted. Doing so would result in undefined behaviour.
+ *
+ * Return value
+ * ============
+ * On success, it returns 0.
+ * If the bundle is already closed it returns -ENOENT.
+ *
+ * Arguments
+ * =========
+ * bundle		(input)
+ * 			Bundle containing the task.
+ *
+ * task			(input)
+ * 			The task to remove.
+ */
+int starpu_task_bundle_remove(starpu_task_bundle_t bundle, struct starpu_task *task);
 
-/* Close a bundle. No task can be added to a closed bundle. A closed bundle
- * automatically gets deinitialized when it becomes empty. */
-void starpu_task_bundle_close(struct starpu_task_bundle *bundle);
+/* starpu_task_bundle_close
+ * =========================
+ * Purpose
+ * =======
+ * Calling this function informs the runtime that the user
+ * won't modify the bundle anymore, i.e. no more tasks will be
+ * inserted or removed.
+ * Thus the runtime can destroy it when needed.
+ *
+ * Arguments
+ * =========
+ * bundle		(input)
+ * 			Bundle to close.
+ */
+void starpu_task_bundle_close(starpu_task_bundle_t bundle);
 
-/* Return the expected duration of the entire task bundle in µs. */
-double starpu_task_bundle_expected_length(struct starpu_task_bundle *bundle, enum starpu_perf_archtype arch, unsigned nimpl);
-/* Return the time (in µs) expected to transfer all data used within the bundle */
-double starpu_task_bundle_expected_data_transfer_time(struct starpu_task_bundle *bundle, unsigned memory_node);
-/* Return the expected power consumption of the entire task bundle in J. */
-double starpu_task_bundle_expected_power(struct starpu_task_bundle *bundle,  enum starpu_perf_archtype arch, unsigned nimpl);
+#ifdef __cplusplus
+}
+#endif
 
 #endif // __STARPU_TASK_BUNDLE_H__

+ 23 - 4
include/starpu_task_list.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010-2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,7 +19,13 @@
 
 #include <starpu_task.h>
 
-struct starpu_task_list {
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+struct starpu_task_list
+{
 	struct starpu_task *head;
 	struct starpu_task *tail;
 };
@@ -28,7 +34,7 @@ struct starpu_task_list {
 void starpu_task_list_init(struct starpu_task_list *list);
 
 /* Push a task at the front of a list */
-void starpu_task_list_push_front(struct starpu_task_list *list,	struct starpu_task *task);
+void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task);
 
 /* Push a task at the back of a list */
 void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task);
@@ -50,5 +56,18 @@ struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list);
 
 /* Remove the element at the back of the list */
 struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list);
-						
+
+/* Get the first task of the list */
+struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list);
+
+/* Get the end of the list */
+struct starpu_task *starpu_task_list_end(struct starpu_task_list *list);
+
+/* Get the next task of the list. This is not erase-safe. */
+struct starpu_task *starpu_task_list_next(struct starpu_task *task);
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* __STARPU_TASK_LIST_H__ */

+ 72 - 124
include/starpu_top.h

@@ -17,23 +17,24 @@
 
 #ifndef __STARPU_TOP_H__
 #define __STARPU_TOP_H__
+
+#include <starpu.h>
 #include <stdlib.h>
 #include <time.h>
-#include <starpu.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
- 
-typedef enum
+enum starpu_top_data_type
 {
-	STARPUTOP_DATA_BOOLEAN,
-	STARPUTOP_DATA_INTEGER,
-	STARPUTOP_DATA_FLOAT
-} starputop_data_type;
+	STARPU_TOP_DATA_BOOLEAN,
+	STARPU_TOP_DATA_INTEGER,
+	STARPU_TOP_DATA_FLOAT
+};
 
-typedef struct starputop_data_t
+struct starpu_top_data
 {
 	unsigned int id;
 	const char* name;
@@ -42,35 +43,35 @@ typedef struct starputop_data_t
 	double double_min_value;
 	double double_max_value;
 	int active;
-	starputop_data_type type;
-	struct starputop_data_t * next;
-} starputop_data;
+	enum starpu_top_data_type type;
+	struct starpu_top_data * next;
+};
 
-typedef enum
+enum starpu_top_param_type
 {
-	STARPUTOP_PARAM_BOOLEAN,
-	STARPUTOP_PARAM_INTEGER,
-	STARPUTOP_PARAM_FLOAT,
-	STARPUTOP_PARAM_ENUM
-} starputop_param_type;
+	STARPU_TOP_PARAM_BOOLEAN,
+	STARPU_TOP_PARAM_INTEGER,
+	STARPU_TOP_PARAM_FLOAT,
+	STARPU_TOP_PARAM_ENUM
+};
 
-typedef struct starputop_param_t
+struct starpu_top_param
 {
 	unsigned int id;
 	const char* name;
-	starputop_param_type type;
+	enum starpu_top_param_type type;
 	void* value;
 	char** enum_values; /* only for enum type can be NULL */
 	int nb_values;
-	void (*callback)(struct starputop_param_t*);
+	void (*callback)(struct starpu_top_param*);
 	int int_min_value; /* only for integer type */
 	int int_max_value;
 	double double_min_value; /*only for double type */
 	double double_max_value;
-	struct starputop_param_t * next;
-} starputop_param;
+	struct starpu_top_param * next;
+};
 
-typedef enum
+enum starpu_top_message_type
 {
 	TOP_TYPE_GO,
 	TOP_TYPE_SET,
@@ -78,19 +79,8 @@ typedef enum
 	TOP_TYPE_ENABLE,
 	TOP_TYPE_DISABLE,
 	TOP_TYPE_DEBUG,
-	TOP_TYPE_UNKNOW	
-} starputop_message_type;
-
-
-/* 
- * This function returns 1 if starpu_top is initialized. 0 otherwise.
- */
-int starpu_top_status_get();
-
-/*
- * Convert timespec to ms
- */
-unsigned long long starpu_timing_timespec_to_ms(const struct timespec *ts);
+	TOP_TYPE_UNKNOW
+};
 
 /*****************************************************
 ****   Functions to call BEFORE initialisation   *****
@@ -100,52 +90,49 @@ unsigned long long starpu_timing_timespec_to_ms(const struct timespec *ts);
  * If active=0, the value will NOT be displayed to user by default.
  * Any other value will make the value displayed by default.
 */
-starputop_data * starputop_add_data_boolean(
-			const char* data_name,
-			int active);
+struct starpu_top_data *starpu_top_add_data_boolean(const char* data_name,
+						    int active);
 /*
  * This fonction register a data named data_name of type integer
  * The minimum and maximum value will be usefull to define the scale in UI
  * If active=0, the value will NOT be displayed to user by default.
  * Any other value will make the value displayed by default.
 */
-starputop_data * starputop_add_data_integer(
-			const char* data_name, 
-			int minimum_value, 
-			int maximum_value, 
-			int active);
+struct starpu_top_data * starpu_top_add_data_integer(const char* data_name,
+						     int minimum_value,
+						     int maximum_value,
+						     int active);
 /*
  * This fonction register a data named data_name of type float
  * The minimum and maximum value will be usefull to define the scale in UI
  * If active=0, the value will NOT be displayed to user by default.
  * Any other value will make the value displayed by default.
 */
-starputop_data* starputop_add_data_float(const char* data_name, 
-			double minimum_value, 
-			double maximum_value, 
-			int active);
+struct starpu_top_data* starpu_top_add_data_float(const char* data_name,
+						  double minimum_value,
+						  double maximum_value,
+						  int active);
 
 /*
  * This fonction register a parameter named parameter_name, of type boolean.
- * The callback fonction will be called when the parameter is modified by UI, 
+ * The callback fonction will be called when the parameter is modified by UI,
  * and can be null.
 */
-starputop_param* starputop_register_parameter_boolean(
-			const char* param_name, 
-			int* parameter_field, 
-			void (*callback)(struct starputop_param_t*));
+struct starpu_top_param* starpu_top_register_parameter_boolean(const char* param_name,
+							       int* parameter_field,
+							       void (*callback)(struct starpu_top_param*));
 /*
  * This fonction register a parameter named param_name, of type integer.
  * Minimum and maximum value will be used to prevent user seting incorrect
  * value.
- * The callback fonction will be called when the parameter is modified by UI, 
+ * The callback fonction will be called when the parameter is modified by UI,
  * and can be null.
 */
-starputop_param* starputop_register_parameter_integer(const char* param_name, 
-			int* parameter_field, 
-			int minimum_value, 
-			int maximum_value,
-			void (*callback)(struct starputop_param_t*));
+struct starpu_top_param* starpu_top_register_parameter_integer(const char* param_name,
+							       int* parameter_field,
+							       int minimum_value,
+							       int maximum_value,
+							       void (*callback)(struct starpu_top_param*));
 /*
  * This fonction register a parameter named param_name, of type float.
  * Minimum and maximum value will be used to prevent user seting incorrect
@@ -153,12 +140,11 @@ starputop_param* starputop_register_parameter_integer(const char* param_name,
  * The callback fonction will be called when the parameter is modified by UI,
  * and can be null.
 */
-starputop_param* starputop_register_parameter_float(
-			const char* param_name, 
-			double* parameter_field, 
-			double minimum_value, 
-			double maximum_value, 
-			void (*callback)(struct starputop_param_t*));
+struct starpu_top_param* starpu_top_register_parameter_float(const char* param_name,
+							     double* parameter_field,
+							     double minimum_value,
+							     double maximum_value,
+							     void (*callback)(struct starpu_top_param*));
 
 /*
  * This fonction register a parameter named param_name, of type enum.
@@ -167,12 +153,11 @@ starputop_param* starputop_register_parameter_float(
  * The callback fonction will be called when the parameter is modified by UI,
  * and can be null.
 */
-starputop_param* starputop_register_parameter_enum(
-			const char* param_name, 
-			int* parameter_field, 
-			char** values,
-			int nb_values, 
-			void (*callback)(struct starputop_param_t*));
+struct starpu_top_param* starpu_top_register_parameter_enum(const char* param_name,
+							    int* parameter_field,
+							    char** values,
+							    int nb_values,
+							    void (*callback)(struct starpu_top_param*));
 
 
 
@@ -186,7 +171,7 @@ starputop_param* starputop_register_parameter_enum(
  * This function will wait for a TOP to connect, send initialisation
  * sentences, and wait for the GO message.
  */
-void starputop_init_and_wait(const char* server_name);
+void starpu_top_init_and_wait(const char* server_name);
 
 /****************************************************
 ************ To call after initialisation************
@@ -196,70 +181,33 @@ void starputop_init_and_wait(const char* server_name);
  * This function should be called after every modification
  * of a parameter from something other than starpu_top.
  * This fonction notice UI that the configuration changed
- */ 
-void starputop_update_parameter(const starputop_param* param);
-
-/*
- * This functions update the value of the starputop_data on UI
  */
-void starputop_update_data_boolean(
-			const starputop_data* data, 
-			int value);
-void starputop_update_data_integer(
-			const starputop_data* data, 
-			int value);
-void starputop_update_data_float(
-			const starputop_data* data, 
-			double value);
+void starpu_top_update_parameter(const struct starpu_top_param* param);
 
 /*
- * This functions notify UI than the task has started or ended
+ * These functions update the value of the starpu_top_data on the UI
  */
-void starputop_task_started(
-			struct starpu_task *task, 
-			int devid, 
-			const struct timespec* ts);
-void starputop_task_ended(
-			struct starpu_task *task, 
-			int devid, 
-			const struct timespec* ts );
-/*
- * This functions notify UI than the task have been planed to 
- * run from timestamp_begin to timestamp_end, on computation-core
- */
-void starputop_task_prevision_timespec(
-			struct starpu_task *task, 
-			int devid, 
-			const struct timespec* start, 
-			const struct timespec* end);
-void starputop_task_prevision(
-			struct starpu_task *task, 
-			int devid, unsigned long long start, 
-			unsigned long long end);
+void starpu_top_update_data_boolean(const struct starpu_top_data* data,
+				    int value);
+void starpu_top_update_data_integer(const struct starpu_top_data* data,
+				    int value);
+void starpu_top_update_data_float(const struct starpu_top_data* data,
+				  double value);
 
- 
 /*
  * This functions are usefull in debug mode. The starpu developper doesn't need
  * to check if the debug mode is active.
- * This is checked by starputop itsefl.
- * 
+ * This is checked by starpu_top itself.
+ *
  * top_debug_log just send a message to display by UI
- * top_debug_lock send a message and wait for a continue message from UI 
+ * top_debug_lock send a message and wait for a continue message from UI
  * to return
- * 
+ *
  * The lock (wich create a stop-point) should be called only by the main thread.
  * Calling it from more than one thread is not supported.
  */
-void starputop_debug_log(const char* message);
-void starputop_debug_lock(const char* message);
-
-/****************************************************
-***************** Callback function *****************
-*****************************************************/
-
-void starputop_process_input_message(char *message);
-	
-	
+void starpu_top_debug_log(const char* message);
+void starpu_top_debug_lock(const char* message);
 
 
 #ifdef __cplusplus

+ 69 - 70
include/starpu_util.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,11 +23,11 @@
 #include <stdlib.h>
 #include <string.h>
 #include <assert.h>
-#include <starpu_config.h>
-#include <starpu_perfmodel.h>
+#include <starpu.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 #define STARPU_POISON_PTR	((void *)0xdeadbeef)
@@ -36,17 +36,47 @@ extern "C" {
 #define STARPU_MAX(a,b)	((a)<(b)?(b):(a))
 
 #ifdef STARPU_NO_ASSERT
-#define STARPU_ASSERT(x)	do {} while(0);
+#define STARPU_ASSERT(x)		do {} while(0)
+#define STARPU_ASSERT_MSG(x, msg)	do {} while(0)
 #else
 #  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
-#    define STARPU_ASSERT(x)	do { if (!(x)) *(int*)NULL = 0; } while(0)
+#    define STARPU_ASSERT(x)		do { if (!(x)) *(int*)NULL = 0; } while(0)
+#    define STARPU_ASSERT_MSG(x, msg)	do { if (!(x)) { fprintf(stderr, "%s\n", msg); *(int*)NULL = 0; }} while(0)
 #  else
-#    define STARPU_ASSERT(x)	assert(x)
+#    define STARPU_ASSERT(x)		assert(x)
+#    define STARPU_ASSERT_MSG(x, msg)	do { if (!(x)) { fprintf(stderr, "%s\n", msg); } ; assert(x); } while(0)
+
 #  endif
 #endif
 
 #define STARPU_ABORT()		abort()
 
+#if defined(STARPU_HAVE_STRERROR_R)
+#  define STARPU_CHECK_RETURN_VALUE(err, message) {if (err < 0) { \
+			char xmessage[256]; strerror_r(-err, xmessage, 256); \
+			fprintf(stderr, "StarPU function <%s> returned unexpected value: <%d:%s>\n", message, err, xmessage); \
+			STARPU_ASSERT(0); }}
+#  define STARPU_CHECK_RETURN_VALUE_IS(err, value, message) {if (err != value) { \
+			char xmessage[256]; strerror_r(-err, xmessage, 256); \
+			fprintf(stderr, "StarPU function <%s> returned unexpected value: <%d:%s>\n", message, err, xmessage); \
+			STARPU_ASSERT(0); }}
+#else
+#  define STARPU_CHECK_RETURN_VALUE(err, message) {if (err < 0) {		\
+			fprintf(stderr, "StarPU function <%s> returned unexpected value: <%d>\n", message, err); \
+			STARPU_ASSERT(0); }}
+#  define STARPU_CHECK_RETURN_VALUE_IS(err, value, message) {if (err != value) { \
+			fprintf(stderr, "StarPU function <%s> returned unexpected value: <%d>\n", message, err); \
+			STARPU_ASSERT(0); }}
+#endif /* STARPU_HAVE_STRERROR_R */
+
+/* Return true (non-zero) if GCC version MAJ.MIN or later is being used
+ * (macro taken from glibc.)  */
+#if defined __GNUC__ && defined __GNUC_MINOR__
+# define STARPU_GNUC_PREREQ(maj, min) \
+	((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
+#else
+# define STARPU_GNUC_PREREQ(maj, min) 0
+#endif
 
 #ifdef __GNUC__
 #  define STARPU_UNLIKELY(expr)          (__builtin_expect(!!(expr),0))
@@ -60,13 +90,21 @@ extern "C" {
 #  define STARPU_ATTRIBUTE_INTERNAL
 #endif
 
+#if STARPU_GNUC_PREREQ(3, 1) && !defined(BUILDING_STARPU) && !defined(STARPU_USE_DEPRECATED_API)
+#define STARPU_DEPRECATED  __attribute__((__deprecated__))
+#else
+#define STARPU_DEPRECATED
+#endif /* __GNUC__ */
+
 #if defined(__i386__) || defined(__x86_64__)
 
-static __inline unsigned starpu_cmpxchg(unsigned *ptr, unsigned old, unsigned next) {
+static __inline unsigned starpu_cmpxchg(unsigned *ptr, unsigned old, unsigned next)
+{
 	__asm__ __volatile__("lock cmpxchgl %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
-static __inline unsigned starpu_xchg(unsigned *ptr, unsigned next) {
+static __inline unsigned starpu_xchg(unsigned *ptr, unsigned next)
+{
 	/* Note: xchg is always locked already */
 	__asm__ __volatile__("xchgl %1,%0": "+m" (*ptr), "+q" (next) : : "memory");
 	return next;
@@ -75,9 +113,11 @@ static __inline unsigned starpu_xchg(unsigned *ptr, unsigned next) {
 #endif
 
 #define STARPU_ATOMIC_SOMETHING(name,expr) \
-static __inline unsigned starpu_atomic_##name(unsigned *ptr, unsigned value) { \
+static __inline unsigned starpu_atomic_##name(unsigned *ptr, unsigned value) \
+{ \
 	unsigned old, next; \
-	while (1) { \
+	while (1) \
+	{ \
 		old = *ptr; \
 		next = expr; \
 		if (starpu_cmpxchg(ptr, old, next) == old) \
@@ -133,7 +173,8 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #include <starpu_task.h>
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 static __inline int starpu_get_env_number(const char *str)
@@ -141,7 +182,8 @@ static __inline int starpu_get_env_number(const char *str)
 	char *strval;
 
 	strval = getenv(str);
-	if (strval) {
+	if (strval)
+	{
 		/* the env variable was actually set */
 		unsigned val;
 		char *check;
@@ -152,7 +194,8 @@ static __inline int starpu_get_env_number(const char *str)
 		/* fprintf(stderr, "ENV %s WAS %d\n", str, val); */
 		return val;
 	}
-	else {
+	else
+	{
 		/* there is no such env variable */
 		/* fprintf("There was no %s ENV\n", str); */
 		return -1;
@@ -162,50 +205,6 @@ static __inline int starpu_get_env_number(const char *str)
 /* Add an event in the execution trace if FxT is enabled */
 void starpu_trace_user_event(unsigned long code);
 
-#define STARPU_FXT_MAX_FILES	64
-
-struct starpu_fxt_codelet_event {
-	char symbol[256]; /* name of the codelet */
-	int workerid;
-	enum starpu_perf_archtype archtype;
-	uint32_t hash;
-	size_t size;
-	float time;
-};
-
-struct starpu_fxt_options {
-	unsigned per_task_colour;
-	unsigned no_counter;
-	unsigned no_bus;
-	unsigned ninputfiles;
-	char *filenames[STARPU_FXT_MAX_FILES];
-	char *out_paje_path;
-	char *distrib_time_path;
-	char *activity_path;
-	char *dag_path;
-
-	/* In case we are going to gather multiple traces (eg in the case of
-	 * MPI processes), we may need to prefix the name of the containers. */
-	char *file_prefix;
-	uint64_t file_offset;
-	int file_rank;
-
-	/*
-	 *	Output parameters
-	 */
-
-	char worker_names[STARPU_NMAXWORKERS][256]; 
-	enum starpu_perf_archtype worker_archtypes[STARPU_NMAXWORKERS];
-	int nworkers;
-
-	/* In case we want to dump the list of codelets to an external tool */
-	struct starpu_fxt_codelet_event **dumped_codelets;
-	long dumped_codelets_count;
-};
-
-void starpu_fxt_options_init(struct starpu_fxt_options *options);
-void starpu_fxt_generate_trace(struct starpu_fxt_options *options);
-
 /* Some helper functions for application using CUBLAS kernels */
 void starpu_helper_cublas_init(void);
 void starpu_helper_cublas_shutdown(void);
@@ -229,28 +228,28 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
  * (if enabled) or by calling starpu_task_wait_for_all(). If callback_func is
  * not NULL, this callback function is executed after the handle has been
  * copied, and it is given the callback_arg pointer as argument.*/
-int starpu_data_cpy(starpu_data_handle dst_handle, starpu_data_handle src_handle,
-			int asynchronous, void (*callback_func)(void*), void *callback_arg);
+int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
 
 /* Constants used by the starpu_insert_task helper to determine the different types of argument */
 #define STARPU_VALUE		(1<<4)	/* Pointer to a constant value */
 #define STARPU_CALLBACK		(1<<5)	/* Callback function */
-#define STARPU_CALLBACK_ARG	(1<<6)	/* Argument of the callback function (of type void *) */
-#define STARPU_PRIORITY		(1<<7)	/* Priority associated to the task */
-#define STARPU_EXECUTE_ON_NODE	(1<<8)	/* Used by MPI to define which task is going to execute the codelet */
-#define STARPU_EXECUTE_ON_DATA	(1<<9)	/* Used by MPI to define which task is going to execute the codelet */
-#define STARPU_HYPERVISOR_TAG	(1<<10)	/* Used to checkpoint a task after whose execution we'll execute  a code */
+#define STARPU_CALLBACK_WITH_ARG	(1<<6)	/* Callback function together with its argument */
+#define STARPU_CALLBACK_ARG	(1<<7)	/* Argument of the callback function (of type void *) */
+#define STARPU_PRIORITY		(1<<8)	/* Priority associated to the task */
+#define STARPU_EXECUTE_ON_NODE	(1<<9)	/* Used by MPI to define which node is going to execute the codelet */
+#define STARPU_EXECUTE_ON_DATA	(1<<10)	/* Used by MPI to define which node is going to execute the codelet: the one owning the given data */
+#define STARPU_HYPERVISOR_TAG	(1<<11)	/* Used to checkpoint a task: some code is executed once the task has completed */
 
 /* Wrapper to create a task. */
-int starpu_insert_task(starpu_codelet *cl, ...);
+int starpu_insert_task(struct starpu_codelet *cl, ...);
 
 /* Retrieve the arguments of type STARPU_VALUE associated to a task
  * automatically created using starpu_insert_task. */
-void starpu_unpack_cl_args(void *cl_arg, ...);
+void starpu_codelet_unpack_args(void *cl_arg, ...);
 
 /* Pack arguments of type STARPU_VALUE into a buffer which can be
- * given to a codelet and later unpacked with starpu_unpack_cl_args */
-void starpu_pack_cl_args(char **arg_buffer, size_t *arg_buffer_size, ...);
+ * given to a codelet and later unpacked with starpu_codelet_unpack_args */
+void starpu_codelet_pack_args(char **arg_buffer, size_t *arg_buffer_size, ...);
 
 #ifdef __cplusplus
 }

+ 95 - 0
m4/acinclude.m4

@@ -0,0 +1,95 @@
+dnl Copyright (C) Free Software Foundation, Inc.
+dnl
+dnl This program is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU General Public License as published by
+dnl the Free Software Foundation; either version 2 of the License, or
+dnl (at your option) any later version.
+dnl 
+dnl This program is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+dnl GNU General Public License for more details.
+dnl 
+dnl You should have received a copy of the GNU General Public License
+dnl along with this program; if not, write to the Free Software
+dnl Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+dnl
+dnl The tests below are taken from libgfortran
+
+dnl Check whether the target supports __sync_val_compare_and_swap.
+AC_DEFUN([STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP], [
+  AC_CACHE_CHECK([whether the target supports __sync_val_compare_and_swap],
+		 ac_cv_have_sync_val_compare_and_swap, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __sync_val_compare_and_swap(&foo, 0, 1);])],
+			[ac_cv_have_sync_val_compare_and_swap=yes],
+			[ac_cv_have_sync_val_compare_and_swap=no])])
+  if test $ac_cv_have_sync_val_compare_and_swap = yes; then
+    AC_DEFINE(STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP, 1,
+	      [Define to 1 if the target supports __sync_val_compare_and_swap])
+  fi])
+
+dnl Check whether the target supports __sync_bool_compare_and_swap.
+AC_DEFUN([STARPU_CHECK_SYNC_BOOL_COMPARE_AND_SWAP], [
+  AC_CACHE_CHECK([whether the target supports __sync_bool_compare_and_swap],
+		 ac_cv_have_sync_bool_compare_and_swap, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __sync_bool_compare_and_swap(&foo, 0, 1);])],
+			[ac_cv_have_sync_bool_compare_and_swap=yes],
+			[ac_cv_have_sync_bool_compare_and_swap=no])])
+  if test $ac_cv_have_sync_bool_compare_and_swap = yes; then
+    AC_DEFINE(STARPU_HAVE_SYNC_BOOL_COMPARE_AND_SWAP, 1,
+	      [Define to 1 if the target supports __sync_bool_compare_and_swap])
+  fi])
+
+dnl Check whether the target supports __sync_fetch_and_add.
+AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_ADD], [
+  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_add],
+		 ac_cv_have_sync_fetch_and_add, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __sync_fetch_and_add(&foo, 1);])],
+			[ac_cv_have_sync_fetch_and_add=yes],
+			[ac_cv_have_sync_fetch_and_add=no])])
+  if test $ac_cv_have_sync_fetch_and_add = yes; then
+    AC_DEFINE(STARPU_HAVE_SYNC_FETCH_AND_ADD, 1,
+	      [Define to 1 if the target supports __sync_fetch_and_add])
+  fi])
+
+dnl Check whether the target supports __sync_fetch_and_or.
+AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_OR], [
+  AC_CACHE_CHECK([whether the target supports __sync_fetch_and_or],
+		 ac_cv_have_sync_fetch_and_or, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __sync_fetch_and_or(&foo, 1);])],
+			[ac_cv_have_sync_fetch_and_or=yes],
+			[ac_cv_have_sync_fetch_and_or=no])])
+  if test $ac_cv_have_sync_fetch_and_or = yes; then
+    AC_DEFINE(STARPU_HAVE_SYNC_FETCH_AND_OR, 1,
+	      [Define to 1 if the target supports __sync_fetch_and_or])
+  fi])
+
+dnl Check whether the target supports __sync_lock_test_and_set.
+AC_DEFUN([STARPU_CHECK_SYNC_LOCK_TEST_AND_SET], [
+  AC_CACHE_CHECK([whether the target supports __sync_lock_test_and_set],
+		 ac_cv_have_sync_lock_test_and_set, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __sync_lock_test_and_set(&foo, 1);])],
+			[ac_cv_have_sync_lock_test_and_set=yes],
+			[ac_cv_have_sync_lock_test_and_set=no])])
+  if test $ac_cv_have_sync_lock_test_and_set = yes; then
+    AC_DEFINE(STARPU_HAVE_SYNC_LOCK_TEST_AND_SET, 1,
+	      [Define to 1 if the target supports __sync_lock_test_and_set])
+  fi])
+
+dnl Check whether the target supports __sync_synchronize.
+AC_DEFUN([STARPU_CHECK_SYNC_SYNCHRONIZE], [
+  AC_CACHE_CHECK([whether the target supports __sync_synchronize],
+		 ac_cv_have_sync_synchronize, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM(,
+			[__sync_synchronize();])],
+			[ac_cv_have_sync_synchronize=yes],
+			[ac_cv_have_sync_synchronize=no])])
+  if test $ac_cv_have_sync_synchronize = yes; then
+    AC_DEFINE(STARPU_HAVE_SYNC_SYNCHRONIZE, 1,
+	      [Define to 1 if the target supports __sync_synchronize])
+  fi])

+ 33 - 3
m4/gcc.m4

@@ -1,6 +1,6 @@
 dnl -*- Autoconf -*-
 dnl
-dnl Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+dnl Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 dnl
 dnl StarPU is free software; you can redistribute it and/or modify
 dnl it under the terms of the GNU Lesser General Public License as published by
@@ -30,12 +30,29 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
   AC_REQUIRE([AC_PROG_CC])
   AC_CACHE_CHECK([whether GCC supports plug-ins], [ac_cv_have_gcc_plugins], [
     if test "x$GCC" = xyes; then
+      # ICC 12.1.0 and Clang 3.1 (among others) support `--version',
+      # define `__GNUC__', and provide a `-print-file-name=plugin'
+      # that returns GCC's valid header directory.  This makes them
+      # hardly distinguishable from GCC.  Actually, ICC 12.1.0 is able
+      # to compile our plug-in, but silently ignores `-fplugin', leading
+      # to obvious build failures; thus, it is explicitly excluded below.
       _STARPU_WITH_GCC_PLUGIN_API([
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <gcc-plugin.h>
 	      #include <tree.h>
 	      #include <gimple.h>
+
+	      #if defined __INTEL_COMPILER || defined __ICC
+	      Beware, this compiler is a fake.  Don't use it.
+	      #endif
+
 	      tree fndecl; gimple call;]],
-	    [[fndecl = lookup_name (get_identifier ("puts"));
+	    [[/* Clang 3.1 doesn't support nested functions, so try to
+	         discriminate it this way.  */
+	      tree foo (void)
+	      {
+	        return lookup_name (get_identifier ("puts"));
+              }
+	      fndecl = foo ();
 	      call = gimple_build_call (fndecl, 0);]])],
 	  [ac_cv_have_gcc_plugins="yes"],
 	  [ac_cv_have_gcc_plugins="no"])
@@ -52,9 +69,10 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
     dnl   build_call_expr_loc_array -- not in GCC 4.5.x; appears in 4.6
     dnl   build_call_expr_loc_vec   -- likewise
     dnl   build_array_ref           -- present but undeclared in 4.6.1
+    dnl   build_zero_cst            -- not in GCC 4.5.x; appears in 4.6
     _STARPU_WITH_GCC_PLUGIN_API([
       AC_CHECK_DECLS([build_call_expr_loc_array, build_call_expr_loc_vec,
-                      build_array_ref],
+                      build_array_ref, build_zero_cst],
         [], [], [#include <gcc-plugin.h>
 	         #include <tree.h>])
 
@@ -63,8 +81,20 @@ AC_DEFUN([STARPU_GCC_PLUGIN_SUPPORT], [
       AC_CHECK_HEADERS([c-common.h c-pragma.h c-family/c-common.h c-family/c-pragma.h],
         [], [], [#include <gcc-plugin.h>
 	         #include <tree.h>])
+
+      AC_DEFINE_UNQUOTED([STARPU_INCLUDE_DIR],
+        ["`eval "echo $includedir"`/starpu/$STARPU_EFFECTIVE_VERSION"],
+        [Define to the directory where StarPU's headers are installed.])
     ])
   fi
 
   AC_SUBST([GCC_PLUGIN_INCLUDE_DIR])
 ])
+
+dnl Substitute `STARPU_GCC_VERSION_MAJOR' and `STARPU_GCC_VERSION_MINOR'.
+AC_DEFUN([STARPU_GCC_VERSION], [
+  AC_COMPUTE_INT([STARPU_GCC_VERSION_MAJOR], [__GNUC__])
+  AC_COMPUTE_INT([STARPU_GCC_VERSION_MINOR], [__GNUC_MINOR__])
+  AC_SUBST([STARPU_GCC_VERSION_MAJOR])
+  AC_SUBST([STARPU_GCC_VERSION_MINOR])
+])

+ 0 - 214
magma_tests/time_zpotrf_tile.c

@@ -1,214 +0,0 @@
-/**
- *
- * @precisions normal z -> c d s
- *
- **/
-#define _TYPE  PLASMA_Complex64_t
-#define _PREC  double
-#define _LAMCH LAPACKE_dlamch_work
-
-#define _NAME  "PLASMA_zpotrf_Tile"
-/* See Lawn 41 page 120 */
-#define _FMULS (n * (1.0 / 6.0 * n + 0.5) * n)
-#define _FADDS (n * (1.0 / 6.0 * n )      * n)
-
-#include "./timing.c"
-
-int first = 1;
-pthread_mutex_t mut;
-void* start_Test(void *p)
-{
-	PLASMA_enum uplo = ((params*)p)->uplo;
-	magma_desc_t *descA = ((params*)p)->descA;
-
-	unsigned ctx = ((params*)p)->ctx;
-	unsigned the_other_ctx = ((params*)p)->the_other_ctx;
-
-	if(ctx != 0)
-		starpu_set_sched_ctx(&ctx);
-
-	if(ctx == 1)
-	{
-		int i, j;
-		int sum = 0;
-		for(i = 0; i < 1000; i++)
-			for(j = 0; j < 100; j++)
-			{
-				sum += i;
-				printf("sum = %d\n", sum);
-			}
-	}
-	real_Double_t t;
-	((params*)p)->t = -cWtime();
-	MAGMA_zpotrf_Tile(uplo, descA);
-	((params*)p)->t += cWtime();
-
-	printf("require stop resize\n");
-	sched_ctx_hypervisor_stop_resize(the_other_ctx);
-/* 	if(ctx != 0) */
-/*         { */
-/*                 pthread_mutex_lock(&mut); */
-/*                 if(first){ */
-/*                         starpu_delete_sched_ctx(ctx, the_other_ctx); */
-/*                 } */
-
-/*                 first = 0; */
-/*                 pthread_mutex_unlock(&mut); */
-/*         } */
-
-
-	return p;
-}
-
-
-static magma_desc_t* do_start_stuff(int *iparam, int n, PLASMA_Complex64_t *A, PLASMA_Complex64_t *AT) 
-{
-    PLASMA_Complex64_t *b, *bT, *x;
-    real_Double_t       t;
-    magma_desc_t       *descA = NULL;
-    int nb, nt;
-    int nrhs  = iparam[TIMING_NRHS];
-    int check = iparam[TIMING_CHECK];
-    int nocpu = iparam[TIMING_NO_CPU];
-    int lda = n;
-    int ldb = n;
-
-    int peak_profiling = iparam[TIMING_PEAK];
-    int profiling      = iparam[TIMING_PROFILE];
-
-    nb  = iparam[TIMING_NB];
-    nt  = n / nb + ((n % nb == 0) ? 0 : 1);
-    
-    /* Allocate Data */
-    AT = (PLASMA_Complex64_t *)malloc(lda*n*sizeof(PLASMA_Complex64_t));
-
-    /* Check if unable to allocate memory */
-    if ( !AT ){
-        printf("Out of Memory \n ");
-        exit(0);
-    }
-
-    /* Initialize Data */
-    MAGMA_Desc_Create(&descA, AT, PlasmaComplexDouble, nb, nb, nb*nb, lda, n, 0, 0, n, n);
-    MAGMA_zplghe_Tile((double)n, descA, 51 );
-
-    /* Save AT in lapack layout for check */
-    if ( check ) {
-        A = (PLASMA_Complex64_t *)malloc(lda*n    *sizeof(PLASMA_Complex64_t));
-        MAGMA_zTile_to_Lapack( descA, (void*)A, n);
-    }
-
-    if ( profiling | peak_profiling )
-        MAGMA_Enable( MAGMA_PROFILING_MODE );
-
-    if (nocpu)
-        morse_zlocality_allrestrict( MAGMA_CUDA );
-    return descA;
-
-}
-
-static void do_end_stuff(int *iparam, double *dparam, magma_desc_t *descA, int n, PLASMA_enum uplo,
-	PLASMA_Complex64_t *A, PLASMA_Complex64_t *AT)
-{
-    PLASMA_Complex64_t *b, *bT, *x;
-    real_Double_t       t;
-    magma_desc_t       *descB = NULL;
-    int nb, nt;
-    int nrhs  = iparam[TIMING_NRHS];
-    int check = iparam[TIMING_CHECK];
-    int nocpu = iparam[TIMING_NO_CPU];
-    int lda = n;
-    int ldb = n;
-
-    int peak_profiling = iparam[TIMING_PEAK];
-    int profiling      = iparam[TIMING_PROFILE];
-
-    if (nocpu)
-        morse_zlocality_allrestore();
-
-    if ( profiling | peak_profiling )
-        MAGMA_Disable( MAGMA_PROFILING_MODE );
-
-    /* Check the solution */
-    if ( check )
-      {
-        b  = (PLASMA_Complex64_t *)malloc(ldb*nrhs*sizeof(PLASMA_Complex64_t));
-        bT = (PLASMA_Complex64_t *)malloc(ldb*nrhs*sizeof(PLASMA_Complex64_t));
-        x  = (PLASMA_Complex64_t *)malloc(ldb*nrhs*sizeof(PLASMA_Complex64_t));
-
-        LAPACKE_zlarnv_work(1, ISEED, ldb*nrhs, bT);
-        MAGMA_Desc_Create(&descB, bT, PlasmaComplexDouble, nb, nb, nb*nb, ldb, nrhs, 0, 0, n, nrhs);
-        MAGMA_zTile_to_Lapack(descB, (void*)b, n);
-
-        MAGMA_zpotrs_Tile( uplo, descA, descB);
-        MAGMA_zTile_to_Lapack(descB, (void*)x, n);
-
-        dparam[TIMING_RES] = zcheck_solution(n, n, nrhs, A, lda, b, x, ldb,
-                                             &(dparam[TIMING_ANORM]), &(dparam[TIMING_BNORM]), 
-                                             &(dparam[TIMING_XNORM]));
-        MAGMA_Desc_Destroy(&descB);
-        free( A );
-        free( b );
-        free( bT );
-        free( x );
-      }
-
-    MAGMA_Desc_Destroy(&descA);
-    free(AT);
-
-    if (peak_profiling) {
-        real_Double_t peak = 0;
-        /*estimate_zgemm_sustained_peak(&peak);*/
-        dparam[TIMING_ESTIMATED_PEAK] = (double)peak;
-    }
-    
-    if (profiling)
-    {
-        /* Profiling of the scheduler */
-        morse_schedprofile_display();
-        /* Profile of each kernel */
-        morse_zdisplay_allprofile();
-    }
-}
-
-static int
-RunTest(int *iparam, double *dparam, real_Double_t *t_) 
-{
-	PLASMA_Complex64_t *A1, *AT1, *A2, *AT2;
-	int n1     = iparam[TIMING_N];
-	int n2     = iparam[TIMING_N2];
-	magma_desc_t       *descA1 = NULL;
-	magma_desc_t       *descA2 = NULL;
-	PLASMA_enum uplo1 = PlasmaLower;
-	PLASMA_enum uplo2 = PlasmaLower;
-	
-	descA1 = do_start_stuff(iparam, n1, A1, AT1);
-	descA2 = do_start_stuff(iparam, n2, A2, AT2);
-	
-	pthread_t tid[2];
-
-	p1.uplo = uplo1;
-	p1.descA = descA1;
-
-	p2.uplo = uplo2;
-	p2.descA = descA2;
-
-        pthread_mutex_init(&mut, NULL);
-
-	pthread_create(&tid[0], NULL, (void*)start_Test, (void*)&p1);
-	pthread_create(&tid[1], NULL, (void*)start_Test, (void*)&p2);
-
-	pthread_join(tid[0], &p1);
-	pthread_join(tid[1], &p2);
-
-	pthread_mutex_destroy(&mut);
-
-	t1[0] = p1.t;
-	t2[0] = p2.t;
-
-        printf("t1 = %lf t2 = %lf \n", t1[0], t2[0]);
-
-	do_end_stuff(iparam, dparam1, descA1, n1, uplo1, A1, AT1);
-	do_end_stuff(iparam, dparam2, descA2, n2, uplo2, A2, AT2);
-    return 0;
-}

+ 0 - 747
magma_tests/timing.c

@@ -1,747 +0,0 @@
-/**
- *
- * @file time_main.c
- *
- *  PLASMA auxiliary routines
- *  PLASMA is a software package provided by Univ. of Tennessee,
- *  Univ. of California Berkeley and Univ. of Colorado Denver
- *
- * @version 2.3.1
- * @author ???
- * @author Mathieu Faverge
- * @date 2010-11-15
- *
- **/
-
-/* Define these so that the Microsoft VC compiler stops complaining
-   about scanf and friends */
-#define _CRT_SECURE_NO_DEPRECATE
-#define _CRT_SECURE_NO_WARNINGS
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#ifdef PLASMA_EZTRACE
-#include <eztrace.h>
-#endif
-
-#if defined( _WIN32 ) || defined( _WIN64 )
-#include <windows.h>
-#include <time.h>
-#include <sys/timeb.h>
-#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
-#define DELTA_EPOCH_IN_MICROSECS  11644473600000000Ui64
-#else
-#define DELTA_EPOCH_IN_MICROSECS  11644473600000000ULL
-#endif
-
-struct timezone
-{
-    int  tz_minuteswest; /* minutes W of Greenwich */
-    int  tz_dsttime;     /* type of dst correction */
-};
-
-int gettimeofday(struct timeval* tv, struct timezone* tz)
-{
-    FILETIME         ft;
-    unsigned __int64 tmpres = 0;
-    static int       tzflag;
-
-    if (NULL != tv)
-        {
-            GetSystemTimeAsFileTime(&ft);
-            tmpres |=  ft.dwHighDateTime;
-            tmpres <<= 32;
-            tmpres |=  ft.dwLowDateTime;
-
-            /*converting file time to unix epoch*/
-            tmpres /= 10;  /*convert into microseconds*/
-            tmpres -= DELTA_EPOCH_IN_MICROSECS;
-
-            tv->tv_sec  = (long)(tmpres / 1000000UL);
-            tv->tv_usec = (long)(tmpres % 1000000UL);
-        }
-    if (NULL != tz)
-        {
-            if (!tzflag)
-                {
-                    _tzset();
-                    tzflag++;
-                }
-            tz->tz_minuteswest = _timezone / 60;
-            tz->tz_dsttime     = _daylight;
-        }
-    return 0;
-}
-
-#else  /* Non-Windows */
-#include <unistd.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#endif
-
-#include <cblas.h>
-#include <lapacke.h>
-#include <plasma.h>
-#include <core_blas.h>
-#include <magma_morse.h>
-#include <sched_ctx_hypervisor.h>
-#include "timing.h"
-#include "auxiliary.h"
-#include <pthread.h>
-
-static int RunTest(int *iparam, _PREC *dparam, double *t_);
-
-double cWtime(void);
-
-int ISEED[4] = {0,0,0,1};   /* initial seed for zlarnv() */
-
-/*
- * struct timeval {time_t tv_sec; suseconds_t tv_usec;};
- */
-double cWtime(void)
-{
-    struct timeval tp;
-    gettimeofday( &tp, NULL );
-    return tp.tv_sec + 1e-6 * tp.tv_usec;
-}
-
-double       *t1, *t2;
-_PREC         dparam1[TIMING_DNBPARAM];
-_PREC         dparam2[TIMING_DNBPARAM];
-
-static int
-Test(int64_t n, int *iparam) {
-    int           i, j, iter, m;
-    int thrdnbr, niter, nrhs;
-    double       *t;
-    _PREC         eps = _LAMCH( 'e' );
-    _PREC         dparam[TIMING_DNBPARAM];
-    double        flops, fmuls, fadds, fp_per_mul, fp_per_add;
-    double        sumgf, sumgf2, sumt, sd, gflops;
-    double        flops_2, fmuls_2, fadds_2;
-    double        sumgf_2, sumgf2_2, sumt_2, sd_2, gflops_2;
-    
-    char         *s;
-    char         *env[] = {
-        "OMP_NUM_THREADS",
-        "MKL_NUM_THREADS",
-        "GOTO_NUM_THREADS",
-        "ACML_NUM_THREADS",
-        "ATLAS_NUM_THREADS",
-        "BLAS_NUM_THREADS", ""
-    };
-    int gnuplot = 0;
-
-    thrdnbr = iparam[TIMING_THRDNBR];
-    niter   = iparam[TIMING_NITER];
-    nrhs    = iparam[TIMING_NRHS];
-
-    if (n < 0 || thrdnbr < 0) {
-        const char *bound_header = iparam[TIMING_BOUND] ? " thGflop/s" : "";
-        const char *check_header = iparam[TIMING_CHECK] ? "   ||Ax-b||       ||A||       ||x||       ||b||         eps ||Ax-b||/N/eps/(||A||||x||+||b||)" : "";
-        const char *peak_header = iparam[TIMING_PEAK] ? "    (\% of peak)  peak" : "";
-
-        printf( "#   N NRHS threads seconds   Gflop/s Deviation        %s%s%s\n", bound_header, peak_header, check_header);
-
-        if (gnuplot) {
-            printf( "set title '%d_NUM_THREADS: ", thrdnbr );
-            for (i = 0; env[i][0]; ++i) {
-                s = getenv( env[i] );
-
-                if (i) printf( " " ); /* separating space */
-
-                for (j = 0; j < 5 && env[i][j] && env[i][j] != '_'; ++j)
-                    printf( "%c", env[i][j] );
-
-                if (s)
-                    printf( "=%s", s );
-                else
-                    printf( "->%s", "?" );
-            }
-            printf( "'\n" );
-            printf( "%s\n%s\n%s\n%s\n%s%s%s\n",
-                    "set xlabel 'Matrix size'",
-                    "set ylabel 'Gflop/s'",
-                    "set key bottom",
-                    gnuplot > 1 ? "set terminal png giant\nset output 'timeplot.png'" : "",
-                    "plot '-' using 1:5 title '", _NAME, "' with linespoints" );
-        }
-
-        return 0;
-    }
-
-    printf( "%5d %4d %5d ", iparam[TIMING_N], iparam[TIMING_NRHS], iparam[TIMING_THRDNBR] );
-    printf( "%5d %4d %5d ", iparam[TIMING_N2], iparam[TIMING_NRHS], iparam[TIMING_THRDNBR] );
-    fflush( stdout );
-
-    t = (double*)malloc(niter*sizeof(double));
-    memset(t, 0, niter*sizeof(double));
-
-    t1 = (double*)malloc(niter*sizeof(double));
-    memset(t, 0, niter*sizeof(double));
-
-    t2 = (double*)malloc(niter*sizeof(double));
-    memset(t, 0, niter*sizeof(double));
-
-    if (sizeof(_TYPE) == sizeof(_PREC)) {
-        fp_per_mul = 1;
-        fp_per_add = 1;
-    } else {
-        fp_per_mul = 6;
-        fp_per_add = 2;
-    }
-
-    m = iparam[TIMING_M];
-    n = iparam[TIMING_N];
-    fadds = _FADDS;
-    fmuls = _FMULS;
-    flops = fmuls * fp_per_mul + fadds * fp_per_add;
-    gflops = 0.0;
-
-    m = iparam[TIMING_M2];
-    n = iparam[TIMING_N2];
-    fadds_2 = _FADDS;
-    fmuls_2 = _FMULS;
-    flops_2 = fmuls_2 * fp_per_mul + fadds_2 * fp_per_add;
-    gflops_2 = 0.0;
-
-    if ( iparam[TIMING_WARMUP] ) {
-        RunTest( iparam, dparam, &(t[0]));
-    }
-
-    sumgf  = 0.0;
-    double sumgf_upper  = 0.0;
-    sumgf2 = 0.0;
-    sumt   = 0.0;
-
-    sumgf_2  = 0.0;
-    double sumgf_upper_2  = 0.0;
-    sumgf2_2 = 0.0;
-    sumt_2   = 0.0;
-    
-    for (iter = 0; iter < niter; iter++)
-    {
-
-#ifdef PLASMA_EZTRACE
-        if( iter == 0 ) {
-            eztrace_start();
-            RunTest( iparam, dparam, &(t[iter]));
-            eztrace_stop();
-        }
-        else
-#endif
-            RunTest( iparam, dparam, &(t[iter]));
-
-        double tmin = 0.0;
-        double integer_tmin = 0.0;
-        double upper_gflops = 0.0;
-
-        double tmin_2 = 0.0;
-        double integer_tmin_2 = 0.0;
-        double upper_gflops_2 = 0.0;
-
-#if 0
-        if (iparam[TIMING_BOUND])
-        {
-            if (iparam[TIMING_BOUNDDEPS]) {
-                FILE *out = fopen("bounddeps.pl", "w");
-                starpu_bound_print_lp(out);
-                fclose(out);
-                out = fopen("bound.dot", "w");
-                starpu_bound_print_dot(out);
-                fclose(out);
-            } else {
-#if 0
-                FILE *out = fopen("bound.pl", "w");
-                starpu_bound_print_lp(out);
-                fclose(out);
-#endif
-                starpu_bound_compute(&tmin, &integer_tmin, 0);
-                upper_gflops  = ((1e-6 * flops) / tmin);
-                starpu_bound_compute(&tmin_2, &integer_tmin_2, 0);
-                upper_gflops_2  = ((1e-6 * flops_2) / tmin_2);
-
-            }
-        }
-#endif
-	printf("t1 = %lf t2 = %lf \n", t1[0], t2[0]);
-        gflops  = (1e-9 * flops) / t1[iter];
-        sumt   += t1[iter];
-        sumgf_upper += upper_gflops;
-        sumgf  += gflops;
-        sumgf2 += gflops*gflops;
-
-        gflops_2  = (1e-9 * flops_2) / t2[iter];
-        sumt_2   += t2[iter];
-        sumgf_upper_2 += upper_gflops_2;
-        sumgf_2  += gflops_2;
-        sumgf2_2 += gflops_2*gflops_2;
-
-    }
-
-    gflops = sumgf / niter;
-    sd = sqrt((sumgf2 - (sumgf*sumgf)/niter)/niter);
-
-    gflops_2 = sumgf_2 / niter;
-    sd_2 = sqrt((sumgf2_2 - (sumgf_2*sumgf_2)/niter)/niter);
-
-    printf( "%9.3f %9.2f +-%7.2f  ", sumt/niter, gflops, sd);
-
-    if (iparam[TIMING_BOUND] && !iparam[TIMING_BOUNDDEPS])
-        printf(" %9.2f",  sumgf_upper/niter);
-
-    if ( iparam[TIMING_PEAK] )
-    {
-       if (dparam1[TIMING_ESTIMATED_PEAK]<0.0f)
-         printf("  n/a    n/a   ");
-       else
-         printf("  %2.2f\%%  %9.2f ", 100.0f*(gflops/dparam1[TIMING_ESTIMATED_PEAK]), dparam1[TIMING_ESTIMATED_PEAK]);
-    }
-
-    if ( iparam[TIMING_CHECK] )
-        printf( "%8.5e %8.5e %8.5e %8.5e %8.5e %8.5e",
-                dparam1[TIMING_RES], dparam1[TIMING_ANORM], dparam1[TIMING_XNORM], dparam1[TIMING_BNORM], eps, 
-                dparam1[TIMING_RES] / n / eps / (dparam1[TIMING_ANORM] * dparam1[TIMING_XNORM] + dparam1[TIMING_BNORM] ));
-    printf("\n");
-
-    printf( "%9.3f %9.2f +-%7.2f  ", sumt_2/niter, gflops_2, sd_2);
-
-    if (iparam[TIMING_BOUND] && !iparam[TIMING_BOUNDDEPS])
-        printf(" %9.2f",  sumgf_upper_2/niter);
-
-    if ( iparam[TIMING_PEAK] )
-    {
-       if (dparam2[TIMING_ESTIMATED_PEAK]<0.0f)
-         printf("  n/a    n/a   ");
-       else
-         printf("  %2.2f\%%  %9.2f ", 100.0f*(gflops_2/dparam2[TIMING_ESTIMATED_PEAK]), dparam2[TIMING_ESTIMATED_PEAK]);
-    }
-
-    if ( iparam[TIMING_CHECK] )
-        printf( "%8.5e %8.5e %8.5e %8.5e %8.5e %8.5e",
-                dparam2[TIMING_RES], dparam2[TIMING_ANORM], dparam2[TIMING_XNORM], dparam2[TIMING_BNORM], eps, 
-                dparam2[TIMING_RES] / n / eps / (dparam2[TIMING_ANORM] * dparam2[TIMING_XNORM] + dparam2[TIMING_BNORM] ));
-    printf("\n");
-
-    fflush( stdout );
-    free(t);
-    free(t1);
-    free(t2);
-
-    return 0;
-}
-
-static int
-startswith(const char *s, const char *prefix) {
-    size_t n = strlen( prefix );
-    if (strncmp( s, prefix, n ))
-        return 0;
-    return 1;
-}
-
-static int
-get_range(char *range, int *start_p, int *stop_p, int *step_p) {
-    char *s, *s1, buf[21];
-    int colon_count, copy_len, nbuf=20, n;
-    int start=1000, stop=10000, step=1000;
-
-    colon_count = 0;
-    for (s = strchr( range, ':'); s; s = strchr( s+1, ':'))
-        colon_count++;
-
-    if (colon_count == 0) { /* No colon in range. */
-        if (sscanf( range, "%d", &start ) < 1 || start < 1)
-            return -1;
-        step = start / 10;
-        if (step < 1) step = 1;
-        stop = start + 10 * step;
-
-    } else if (colon_count == 1) { /* One colon in range.*/
-        /* First, get the second number (after colon): the stop value. */
-        s = strchr( range, ':' );
-        if (sscanf( s+1, "%d", &stop ) < 1 || stop < 1)
-            return -1;
-
-        /* Next, get the first number (before colon): the start value. */
-        n = s - range;
-        copy_len = n > nbuf ? nbuf : n;
-        strncpy( buf, range, copy_len );
-        buf[copy_len] = 0;
-        if (sscanf( buf, "%d", &start ) < 1 || start > stop || start < 1)
-            return -1;
-
-        /* Let's have 10 steps or less. */
-        step = (stop - start) / 10;
-        if (step < 1)
-            step = 1;
-    } else if (colon_count == 2) { /* Two colons in range. */
-        /* First, get the first number (before the first colon): the start value. */
-        s = strchr( range, ':' );
-        n = s - range;
-        copy_len = n > nbuf ? nbuf : n;
-        strncpy( buf, range, copy_len );
-        buf[copy_len] = 0;
-	if(copy_len == 0)
-	  start = 0;
-        else if (sscanf( buf, "%d", &start ) < 1 || start < 1)
-            return -1;
-
-        /* Next, get the second number (after the first colon): the stop value. */
-        s1 = strchr( s+1, ':' );
-        n = s1 - (s + 1);
-        copy_len = n > nbuf ? nbuf : n;
-        strncpy( buf, s+1, copy_len );
-        buf[copy_len] = 0;
-
-	if(copy_len == 0)
-	  stop = 0;
-        else if (sscanf( buf, "%d", &stop ) < 1 || stop < start)
-            return -1;
-
-        /* Finally, get the third number (after the second colon): the step value. */
-        if (sscanf( s1+1, "%d", &step ) < 1 || step < 1)
-            return -1;
-    } else
-
-        return -1;
-
-    *start_p = start;
-    *stop_p = stop;
-    *step_p = step;
-
-    return 0;
-}
-
-static void
-show_help(char *prog_name) {
-    printf( "Usage:\n%s [options]\n\n", prog_name );
-    printf( "Options are:\n" );
-    printf( "  --threads=C    Number of threads (default: 1)\n" );
-    printf( "  --n_range=R    Range of N values: Start:Stop:Step (default: 500:5000:500)\n" );
-    //    printf( "  --gnuplot      produce output suitable for gnuplot" );
-    printf( "  --[no]check    Check result (default: nocheck)\n" );
-    printf( "  --[no]warmup   Perform a warmup run to pre-load libraries (default: warmup)\n");
-    printf( "  --parallel=N   Use parallel tasks of size N (default: no)\n");
-    printf( "  --niter=N      Number of iterations (default: 1)\n");
-    printf( "  --nb=N         Nb size. Not used if autotuning is activated (default: 128)\n");
-    printf( "  --ib=N         IB size. Not used if autotuning is activated (default: 32)\n");
-    printf( "  --nrhs=N       Number of right-hand size (default: 1)\n");
-    printf( "  --[no]dyn      Activate Dynamic scheduling (default: nodyn)\n");
-    printf( "  --[no]atun     Activate autotuning (default: noatun)\n");
-    printf( "  --ifmt         Input format. 0: CM, 1: CCRB, 2: CRRB, 3: RCRB, 4: RRRB, 5: RM (default: 0)\n");
-    printf( "  --ofmt         Output format. 0: CM, 1: CCRB, 2: CRRB, 3: RCRB, 4: RRRB, 5: RM (default: 1)\n");
-    printf( "  --thrdbypb     Number of threads per subproblem for inplace transformation (default: 1)\n");
-    printf( "  --[no]profile  Profile kernels with StarPU (default: no)\n");
-    printf( "  --[no]peak     Evalue sustained peak performance (default: no)\n");
-}
-static void
-get_thread_count(int *thrdnbr) {
-#if defined WIN32 || defined WIN64
-    sscanf( getenv( "NUMBER_OF_PROCESSORS" ), "%d", thrdnbr );
-#else
-    *thrdnbr = sysconf(_SC_NPROCESSORS_ONLN);
-#endif
-}
-
-typedef struct {
-        PLASMA_enum uplo;
-        magma_desc_t *descA;
-        unsigned ctx;
-        unsigned the_other_ctx;
-	real_Double_t t;
-} params;
-
-double compute_flops(int n, int m)
-{
-	double fp_per_mul, fp_per_add;
-	if (sizeof(_TYPE) == sizeof(_PREC)) {
-		fp_per_mul = 1;
-		fp_per_add = 1;
-	} else {
-		fp_per_mul = 6;
-		fp_per_add = 2;
-	}
-	
-	double fmuls = (n * (1.0 / 6.0 * n + 0.5) * n);
-	double fadds = (n * (1.0 / 6.0 * n ) * n);
-	double flops = fmuls * fp_per_mul + fadds * fp_per_add;
-	return flops;
-}
-params p1, p2;
-int
-main(int argc, char *argv[]) {
-    int i;
-    int start =  500;
-    int stop  = 5000;
-    int step  =  500;
-
-    int start1 =  500;
-    int stop1  = 5000;
-    int step1  =  500;
-
-    int start2 =  500;
-    int stop2  = 5000;
-    int step2  =  500;
-
-    int start_cpus1 =  0, start_cpus2 = 0, start_gpus1 = 0, start_gpus2 = 0;
-    int stop_cpus1  = -1, stop_cpus2  = -1, stop_gpus1 = -1, stop_gpus2 = -1;
-    int step_cpus1  =  1, step_cpus2 = 1, step_gpus1 = 1, step_gpus2 = 1;
-
-    int iparam[TIMING_INBPARAM];
-
-    memset(iparam, 0, TIMING_INBPARAM*sizeof(int));
-
-    iparam[TIMING_CHECK         ] = 0;
-    iparam[TIMING_WARMUP        ] = 1;
-    iparam[TIMING_NITER         ] = 1;
-    iparam[TIMING_N             ] = 500;
-    iparam[TIMING_N2            ] = 500;
-    iparam[TIMING_NB            ] = 128;
-    iparam[TIMING_IB            ] = 32;
-    iparam[TIMING_NRHS          ] = 1;
-    iparam[TIMING_THRDNBR       ] = 1;
-    iparam[TIMING_NCUDAS        ] = 0;
-    iparam[TIMING_THRDNBR_SUBGRP] = 1;
-    iparam[TIMING_SCHEDULER     ] = 0;
-    iparam[TIMING_AUTOTUNING    ] = 1;
-    iparam[TIMING_INPUTFMT      ] = 0;
-    iparam[TIMING_OUTPUTFMT     ] = 0;
-    iparam[TIMING_NDOM          ] = 1;
-    iparam[TIMING_PROFILE       ] = 0;
-    iparam[TIMING_PEAK          ] = 0;
-    iparam[TIMING_PARALLEL_TASKS] = 0;
-    iparam[TIMING_NO_CPU        ] = 0;
-    iparam[TIMING_BOUND         ] = 0;
-    iparam[TIMING_BOUNDDEPS     ] = 0;
-    iparam[TIMING_BOUNDDEPSPRIO ] = 0;
-    iparam[TIMING_WITH_CTXS     ] = 1;
-
-    get_thread_count( &(iparam[TIMING_THRDNBR]) );
-
-    for (i = 1; i < argc && argv[i]; ++i) {
-        if (startswith( argv[i], "--help" )) {
-            show_help( argv[0] );
-            return EXIT_SUCCESS;
-        } else if (startswith( argv[i], "--n_cpus1=" )) {
-            get_range( strchr( argv[i], '=' ) + 1, &start_cpus1, &stop_cpus1, &step_cpus1 );
-        } else if (startswith( argv[i], "--n_cpus2=" )) {
-            get_range( strchr( argv[i], '=' ) + 1, &start_cpus2, &stop_cpus2, &step_cpus2 );
-        } else if (startswith( argv[i], "--n_gpus1=" )) {
-            get_range( strchr( argv[i], '=' ) + 1, &start_gpus1, &stop_gpus1, &step_gpus1 );
-        } else if (startswith( argv[i], "--n_gpus2=" )) {
-            get_range( strchr( argv[i], '=' ) + 1, &start_gpus2, &stop_gpus2, &step_gpus2 );
-        } else if (startswith( argv[i], "--n_range=" )) {
-            get_range( strchr( argv[i], '=' ) + 1, &start, &stop, &step );
-        } else if (startswith( argv[i], "--n_range1=" )) {
-            get_range( strchr( argv[i], '=' ) + 1, &start1, &stop1, &step1 );
-	} else if (startswith( argv[i], "--n_range2=" )) {
-            get_range( strchr( argv[i], '=' ) + 1, &start2, &stop2, &step2 );
-        } else if (startswith( argv[i], "--threads=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_THRDNBR]) );
-        /* } else if (startswith( argv[i], "--gnuplot-png" )) { */
-        /*     gnuplot = 2; */
-        /* } else if (startswith( argv[i], "--gnuplot" )) { */
-        /*     gnuplot = 1; */
-        } else if (startswith( argv[i], "--noctxs" )) {
-            iparam[TIMING_WITH_CTXS] = 0;
-        } else if (startswith( argv[i], "--check" )) {
-            iparam[TIMING_CHECK] = 1;
-        } else if (startswith( argv[i], "--nocheck" )) {
-            iparam[TIMING_CHECK] = 0;
-        } else if (startswith( argv[i], "--warmup" )) {
-            iparam[TIMING_WARMUP] = 1;
-        } else if (startswith( argv[i], "--nowarmup" )) {
-            iparam[TIMING_WARMUP] = 0;
-        } else if (startswith( argv[i], "--dyn" )) {
-            iparam[TIMING_SCHEDULER] = 1;
-        } else if (startswith( argv[i], "--nodyn" )) {
-            iparam[TIMING_SCHEDULER] = 0;
-        } else if (startswith( argv[i], "--atun" )) {
-            iparam[TIMING_AUTOTUNING] = 1;
-        } else if (startswith( argv[i], "--noatun" )) {
-            iparam[TIMING_AUTOTUNING] = 0;
-        } else if (startswith( argv[i], "--profile" )) {
-            iparam[TIMING_PROFILE] = 1;
-        } else if (startswith( argv[i], "--peak" )) {
-            iparam[TIMING_PEAK] = 1;
-        } else if (startswith( argv[i], "--noprofile" )) {
-            iparam[TIMING_PROFILE] = 0;
-        } else if (startswith( argv[i], "--parallel=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_PARALLEL_TASKS]) );
-        } else if (startswith( argv[i], "--noparallel" )) {
-            iparam[TIMING_PARALLEL_TASKS] = 0;
-        } else if (startswith( argv[i], "--nocpu" )) {
-            iparam[TIMING_NO_CPU] = 1;
-        } else if (startswith( argv[i], "--nb=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_NB]) );
-        } else if (startswith( argv[i], "--m=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_M]) );
-        } else if (startswith( argv[i], "--ib=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_IB]) );
-        } else if (startswith( argv[i], "--nrhs=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_NRHS]) );
-        } else if (startswith( argv[i], "--ifmt=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_INPUTFMT]) );
-        } else if (startswith( argv[i], "--ofmt=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_OUTPUTFMT]) );
-        } else if (startswith( argv[i], "--thrdbypb=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &(iparam[TIMING_THRDNBR_SUBGRP]) );
-        } else if (startswith( argv[i], "--niter=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &iparam[TIMING_NITER] );
-        } else if (startswith( argv[i], "--ndom=" )) {
-            sscanf( strchr( argv[i], '=' ) + 1, "%d", &iparam[TIMING_NDOM] );
-        } else if (startswith( argv[i], "--bounddepsprio" )) {
-                iparam[TIMING_BOUND] = 1;
-                iparam[TIMING_BOUNDDEPS] = 1;
-                iparam[TIMING_BOUNDDEPSPRIO] = 1;
-        } else if (startswith( argv[i], "--bounddeps" )) {
-                iparam[TIMING_BOUND] = 1;
-                iparam[TIMING_BOUNDDEPS] = 1;
-        } else if (startswith( argv[i], "--bound" )) {
-                iparam[TIMING_BOUND] = 1;
-        } else {
-            fprintf( stderr, "Unknown option: %s\n", argv[i] );
-        }
-    }
-    if (step < 1) step = 1;
-    if (step1 < 1) step1 = 1;
-    if (step2 < 1) step2 = 1;
-
-    /* TODO : correct into plasma */
-    if ( iparam[TIMING_IB] > iparam[TIMING_NB] )
-      iparam[TIMING_IB] = iparam[TIMING_NB];
-
-    /* TODO */
-    if (iparam[TIMING_PARALLEL_TASKS]) {
-        MAGMA_InitPar(iparam[TIMING_THRDNBR]/iparam[TIMING_PARALLEL_TASKS], 
-                      iparam[TIMING_NCUDAS],
-                      iparam[TIMING_PARALLEL_TASKS]);
-    }
-    else {
-        MAGMA_Init( iparam[TIMING_THRDNBR],
-                    iparam[TIMING_NCUDAS]);
-        
-    }
-
-    MAGMA_Disable(MAGMA_AUTOTUNING);
-    MAGMA_Set(MAGMA_TILE_SIZE,        iparam[TIMING_NB] );
-    MAGMA_Set(MAGMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );
-
-    if(iparam[TIMING_WITH_CTXS])
-    {
-	    int nprocs1 = (stop_cpus1 - start_cpus1 + 1)/step_cpus1 + (stop_gpus1 - start_gpus1 + 1)/step_gpus1;
-	    int nprocs2 = (stop_cpus2 - start_cpus2 + 1)/step_cpus2 + (stop_gpus2 - start_gpus2 + 1)/step_gpus2;
-	    int procs1[nprocs1];
-	    int procs2[nprocs2];
-	    int i, j = 0;
-	    printf("%d: ", nprocs1);
-	    for (i = start_gpus1; i <= stop_gpus1; i += step_gpus1)
-	    {
-		    printf("%d ", i);
-		    procs1[j++] = i;
-	    }
-	    
-	    for (i = start_cpus1; i <= stop_cpus1; i += step_cpus1)
-	    {
-		    printf("%d ", i);
-		    procs1[j++] = i;
-	    }
-	    printf("\n");
-	    
-	    printf("%d: ", nprocs2);
-	    j = 0;
-	    for (i = start_gpus2; i <= stop_gpus2; i += step_gpus2)
-	    {
-		    printf("%d ", i);
-		    procs2[j++] = i;
-	    }
-	    
-	    for (i = start_cpus2; i <= stop_cpus2; i += step_cpus2)
-	    {
-		    printf("%d ", i);
-		    procs2[j++] = i;
-	    }
-	    printf("\n");
-	    
-	    struct starpu_sched_ctx_hypervisor_criteria *criteria = sched_ctx_hypervisor_init(SIMPLE_POLICY);
-	    p1.ctx = starpu_create_sched_ctx_with_criteria("heft", procs1, nprocs1, "sched_ctx1", &criteria);
-	    
-	    p2.ctx = starpu_create_sched_ctx_with_criteria("heft", procs2, nprocs2, "sched_ctx2", &criteria);
-
-/* 	    p1.ctx = starpu_create_sched_ctx("heft", procs1, nprocs1, "sched_ctx1"); */
-	    
-/* 	    p2.ctx = starpu_create_sched_ctx("heft", procs2, nprocs2, "sched_ctx2"); */
-
-	    double flops1 = compute_flops(start1, start1);
-	    double flops2 = compute_flops(start2, start2);
-	    printf("flops1 = %lf flops2 = %lf\n", flops1, flops2);
-	    sched_ctx_hypervisor_handle_ctx(p1.ctx, compute_flops(start1, start1));
-
-	    sched_ctx_hypervisor_handle_ctx(p2.ctx, compute_flops(start2, start2));
-	    
-	    p1.the_other_ctx = p2.ctx;
-	    p2.the_other_ctx = p1.ctx;
-	    
-	    int procs[12];
-	    for(i = 0; i < 12; i++)
-		    procs[i] = i;
-
-	    int gpus[3];
-	    for(i = 0; i < 3; i++)
-		    gpus[i] = i;
-	    sched_ctx_hypervisor_ioctl(p1.ctx,
-				       HYPERVISOR_GRANULARITY, 2,
-				       HYPERVISOR_MIN_TASKS, 10,
-				       HYPERVISOR_MIN_WORKERS, 3,
-				       HYPERVISOR_MAX_WORKERS, 12,
-				       HYPERVISOR_FIXED_WORKERS, gpus, 3,
-//				       HYPERVISOR_MAX_IDLE, procs, 12, 40000.0,
-//				       HYPERVISOR_MAX_IDLE, gpus, 3, 10000.0,
-				       NULL);
-	    
-	    sched_ctx_hypervisor_ioctl(p2.ctx,
-				       HYPERVISOR_GRANULARITY, 2,
-				       HYPERVISOR_MIN_TASKS, 10,
-				       HYPERVISOR_MIN_WORKERS, 0,
-				       HYPERVISOR_MAX_WORKERS, 12,
-				       HYPERVISOR_FIXED_WORKERS, gpus, 3,
-//				       HYPERVISOR_MAX_IDLE, procs, 12, 40000.0,
-//				       HYPERVISOR_MAX_IDLE, gpus, 3, 10000.0,
-				       NULL);
-	    
-    }	
-    else
-    {
-	    p1.ctx = 0;
-	    p2.ctx = 0;
-    }
-    
-    Test( -1, iparam ); /* print header */
-
-    iparam[TIMING_N] = start1;
-    iparam[TIMING_N2] = start2;
-    
-    if ( iparam[TIMING_M] == 0 )
-	    iparam[TIMING_M] = iparam[TIMING_N];
-
-    if ( iparam[TIMING_M2] == 0 )
-	    iparam[TIMING_M2] = iparam[TIMING_N2];
-
-    Test( start1, iparam );    
-    
-    MAGMA_Finalize();
-
-    if(iparam[TIMING_WITH_CTXS])
-	    sched_ctx_hypervisor_shutdown();
-    
-    /* if (gnuplot) { */
-    /*         printf( "%s\n%s\n", */
-    /*                 "e", */
-    /*                 gnuplot > 1 ? "" : "pause 10" ); */
-    /* } */
-
-    return EXIT_SUCCESS;
-}

+ 64 - 34
mpi/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -15,6 +15,7 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
 CC=$(MPICC)
+CCLD=$(MPICC)
 
 if STARPU_MPI_CHECK
 TESTS_ENVIRONMENT	=	$(MPIEXEC) -np 2
@@ -37,7 +38,11 @@ EXTRA_DIST = 					\
 	examples/mpi_lu/pxlu_kernels.h		\
 	examples/mpi_lu/pxlu_kernels.c		\
 	examples/cholesky/mpi_cholesky.h	\
-	examples/cholesky/mpi_cholesky_models.h
+	examples/cholesky/mpi_cholesky_models.h \
+	tests/helper.h
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libstarpumpi.pc starpumpi-1.0.pc
 
 examplebindir = $(libdir)/starpu/examples/mpi
 
@@ -58,27 +63,32 @@ NVCCFLAGS += -I$(top_srcdir)/include/ -I$(top_builddir)/include
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS)
-LIBS = $(top_builddir)/src/libstarpu.la @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/mpi/ -I$(top_srcdir)/src/  -I$(top_srcdir)/examples/ -I$(top_builddir)/src -I$(top_builddir)/include
 AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
 
-lib_LTLIBRARIES = libstarpumpi.la
+lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
-libstarpumpi_la_LIBADD = $(top_builddir)/src/libstarpu.la
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
+  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE)
 
 noinst_HEADERS =					\
 	starpu_mpi_private.h				\
-	starpu_mpi_fxt.h
+	starpu_mpi_fxt.h				\
+	starpu_mpi_insert_task_cache.h
 
-include_HEADERS =					\
+versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
+versinclude_HEADERS = 				\
 	starpu_mpi.h					\
 	starpu_mpi_datatype.h
 
-libstarpumpi_la_SOURCES =				\
+libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi.c					\
 	starpu_mpi_helper.c				\
 	starpu_mpi_datatype.c				\
 	starpu_mpi_insert_task.c			\
+	starpu_mpi_insert_task_cache.c			\
 	starpu_mpi_collective.c
 
 ###################
@@ -89,7 +99,7 @@ examplebin_PROGRAMS +=				\
 	examples/stencil/stencil5
 
 examples_stencil_stencil5_LDADD =		\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 check_PROGRAMS	+=	\
 	examples/stencil/stencil5
@@ -105,7 +115,7 @@ examplebin_PROGRAMS += 				\
 	examples/mpi_lu/plu_example_double
 
 examples_mpi_lu_plu_example_float_LDADD =	\
-	libstarpumpi.la				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la				\
 	$(STARPU_LIBNUMA_LDFLAGS)		\
 	$(STARPU_BLAS_LDFLAGS)
 
@@ -117,7 +127,7 @@ examples_mpi_lu_plu_example_float_SOURCES =	\
 	$(top_srcdir)/examples/common/blas.c
 
 examples_mpi_lu_plu_example_double_LDADD =	\
-	libstarpumpi.la				\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la				\
 	$(STARPU_LIBNUMA_LDFLAGS)		\
 	$(STARPU_BLAS_LDFLAGS)
 
@@ -145,7 +155,7 @@ examples_cholesky_mpi_cholesky_SOURCES	=		\
 	$(top_srcdir)/examples/common/blas.c
 
 examples_cholesky_mpi_cholesky_LDADD =			\
-	libstarpumpi.la					\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la					\
 	$(STARPU_BLAS_LDFLAGS)
 
 examples_cholesky_mpi_cholesky_distributed_SOURCES =	\
@@ -155,7 +165,7 @@ examples_cholesky_mpi_cholesky_distributed_SOURCES =	\
 	$(top_srcdir)/examples/common/blas.c
 
 examples_cholesky_mpi_cholesky_distributed_LDADD =	\
-	libstarpumpi.la					\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la					\
 	$(STARPU_BLAS_LDFLAGS)
 
 check_PROGRAMS +=					\
@@ -171,11 +181,28 @@ examplebin_PROGRAMS +=		\
 	examples/scatter_gather/mpi_scatter_gather
 
 examples_scatter_gather_mpi_scatter_gather_LDADD =	\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 check_PROGRAMS +=		\
 	examples/scatter_gather/mpi_scatter_gather
 
+###################
+# Reduction       #
+###################
+
+examplebin_PROGRAMS +=		\
+	examples/reduction/mpi_reduction
+
+examples_reduction_mpi_reduction_SOURCES =		\
+	examples/reduction/mpi_reduction.c		\
+	examples/reduction/mpi_reduction_kernels.c
+
+examples_reduction_mpi_reduction_LDADD =	\
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
+
+check_PROGRAMS +=		\
+	examples/reduction/mpi_reduction
+
 ########################
 # Unit testcases       #
 ########################
@@ -223,43 +250,43 @@ noinst_PROGRAMS =					\
 	tests/multiple_send
 
 tests_mpi_isend_LDADD =					\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_mpi_irecv_LDADD =					\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_mpi_isend_detached_LDADD =			\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_mpi_irecv_detached_LDADD =			\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_mpi_detached_tag_LDADD =				\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_pingpong_LDADD =					\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_mpi_test_LDADD =					\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_ring_LDADD =					\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_ring_async_LDADD =				\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_ring_async_implicit_LDADD =			\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_block_interface_LDADD =				\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_block_interface_pinned_LDADD =			\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_insert_task_LDADD =				\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_insert_task_cache_LDADD =				\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_insert_task_block_LDADD =				\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_insert_task_owner_LDADD =				\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_insert_task_owner2_LDADD =			\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_insert_task_owner_data_LDADD =			\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 tests_multiple_send_LDADD =				\
-	libstarpumpi.la
+	libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
 tests_ring_SOURCES = tests/ring.c
 tests_ring_async_SOURCES = tests/ring_async.c
@@ -269,3 +296,6 @@ tests_ring_SOURCES += tests/ring_kernel.cu
 tests_ring_async_SOURCES += tests/ring_kernel.cu
 tests_ring_async_implicit_SOURCES += tests/ring_kernel.cu
 endif
+
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 62 - 36
mpi/examples/cholesky/mpi_cholesky.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,41 +24,45 @@
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.model = &chol_model_11
 };
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
 	.model = &chol_model_21
 };
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &chol_model_22
 };
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes) {
+int my_distrib(int x, int y, int nb_nodes)
+{
         return (x+y) % nb_nodes;
 }
 
@@ -70,25 +74,29 @@ static void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblo
 {
 	struct timeval start;
 	struct timeval end;
-        starpu_data_handle **data_handles;
+        starpu_data_handle_t **data_handles;
         int x, y;
 
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-        data_handles = malloc(nblocks*sizeof(starpu_data_handle *));
-        for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle));
+        data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+        for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
-        for(x = 0; x < nblocks ;  x++) {
-                for (y = 0; y < nblocks; y++) {
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
                         int mpi_rank = my_distrib(x, y, nodes);
-                        if (mpi_rank == rank) {
+                        if (mpi_rank == rank)
+			{
                                 //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
                                 starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
                                                             ld, size/nblocks, size/nblocks, sizeof(float));
                         }
 			/* TODO: make better test to only registering what is needed */
-                        else {
+                        else
+			{
                                 /* I don't own that index, but will need it for my computations */
                                 //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
@@ -144,8 +152,10 @@ static void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblo
 
         starpu_task_wait_for_all();
 
-        for(x = 0; x < nblocks ;  x++) {
-                for (y = 0; y < nblocks; y++) {
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
                         if (data_handles[x][y])
                                 starpu_data_unregister(data_handles[x][y]);
                 }
@@ -209,7 +219,8 @@ int main(int argc, char **argv)
 	}
 
 
-        if (display) {
+        if (display)
+	{
                 printf("[%d] Input :\n", rank);
 
 		for(y=0 ; y<nblocks ; y++)
@@ -221,10 +232,12 @@ int main(int argc, char **argv)
 				{
 					for (i = 0; i < BLOCKSIZE; i++)
 					{
-						if (i <= j) {
+						if (i <= j)
+						{
 							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
 						}
-						else {
+						else
+						{
 							printf(".\t");
 						}
 					}
@@ -238,7 +251,8 @@ int main(int argc, char **argv)
 
 	starpu_mpi_shutdown();
 
-        if (display) {
+        if (display)
+	{
                 printf("[%d] Results :\n", rank);
 		for(y=0 ; y<nblocks ; y++)
 		{
@@ -249,10 +263,12 @@ int main(int argc, char **argv)
 				{
 					for (i = 0; i < BLOCKSIZE; i++)
 					{
-						if (i <= j) {
+						if (i <= j)
+						{
 							printf("%2.2f\t", bmat[y][x][j +i*BLOCKSIZE]);
 						}
-						else {
+						else
+						{
 							printf(".\t");
 						}
 					}
@@ -263,10 +279,14 @@ int main(int argc, char **argv)
 	}
 
 	float *rmat = malloc(size*size*sizeof(float));
-        for(x=0 ; x<nblocks ; x++) {
-                for(y=0 ; y<nblocks ; y++) {
-                        for (i = 0; i < BLOCKSIZE; i++) {
-                                for (j = 0; j < BLOCKSIZE; j++) {
+        for(x=0 ; x<nblocks ; x++)
+	{
+                for(y=0 ; y<nblocks ; y++)
+		{
+                        for (i = 0; i < BLOCKSIZE; i++)
+			{
+                                for (j = 0; j < BLOCKSIZE; j++)
+				{
                                         rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = bmat[x][y][j +i*BLOCKSIZE];
                                 }
                         }
@@ -278,7 +298,8 @@ int main(int argc, char **argv)
 	{
 		for (i = 0; i < size; i++)
 		{
-			if (i > j) {
+			if (i > j)
+			{
 				rmat[j+i*size] = 0.0f; // debug
 			}
 		}
@@ -290,15 +311,18 @@ int main(int argc, char **argv)
 				rmat, size, 0.0f, test_mat, size);
 
 	fprintf(stderr, "[%d] comparing results ...\n", rank);
-        if (display) {
+        if (display)
+	{
                 for (j = 0; j < size; j++)
 		{
                         for (i = 0; i < size; i++)
 			{
-                                if (i <= j) {
+                                if (i <= j)
+				{
                                         printf("%2.2f\t", test_mat[j +i*size]);
                                 }
-                                else {
+                                else
+				{
                                         printf(".\t");
                                 }
                         }
@@ -312,7 +336,8 @@ int main(int argc, char **argv)
                 for (y = 0; y < nblocks; y++)
 		{
                         int mpi_rank = my_distrib(x, y, nodes);
-                        if (mpi_rank == rank) {
+                        if (mpi_rank == rank)
+			{
                                 for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
                                 {
                                         for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
@@ -321,7 +346,8 @@ int main(int argc, char **argv)
                                                 {
                                                         float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
                                                         float err = abs(test_mat[j +i*size] - orig);
-                                                        if (err > 0.00001) {
+                                                        if (err > 0.00001)
+							{
                                                                 fprintf(stderr, "[%d] Error[%d, %d] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
 								correctness = 0;
 								break;

+ 14 - 7
mpi/examples/cholesky/mpi_cholesky.h

@@ -51,31 +51,38 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args);
 static void __attribute__((unused)) parse_args(int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-size") == 0)
+		{
 		        char *argptr;
 			size = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-nblocks") == 0) {
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
 		        char *argptr;
 			nblocks = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-nbigblocks") == 0) {
+		if (strcmp(argv[i], "-nbigblocks") == 0)
+		{
 		        char *argptr;
 			nbigblocks = strtol(argv[++i], &argptr, 10);
 		}
 
-		if (strcmp(argv[i], "-no-prio") == 0) {
+		if (strcmp(argv[i], "-no-prio") == 0)
+		{
 			noprio = 1;
 		}
 
-		if (strcmp(argv[i], "-display") == 0) {
+		if (strcmp(argv[i], "-display") == 0)
+		{
 			display = 1;
 		}
 
-		if (strcmp(argv[i], "-h") == 0) {
+		if (strcmp(argv[i], "-h") == 0)
+		{
 			printf("usage : %s [-display] [-size size] [-nblocks nblocks]\n", argv[0]);
 		}
 	}

+ 34 - 22
mpi/examples/cholesky/mpi_cholesky_distributed.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,41 +24,45 @@
  *	Create the codelets
  */
 
-static starpu_codelet cl11 =
+static struct starpu_codelet cl11 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u11,
+	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u11,
+	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
 #endif
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.model = &chol_model_11
 };
 
-static starpu_codelet cl21 =
+static struct starpu_codelet cl21 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u21,
+	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u21,
+	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
 #endif
 	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
 	.model = &chol_model_21
 };
 
-static starpu_codelet cl22 =
+static struct starpu_codelet cl22 =
 {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = chol_cpu_codelet_update_u22,
+	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = chol_cublas_codelet_update_u22,
+	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
 #endif
 	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &chol_model_22
 };
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes) {
+int my_distrib(int x, int y, int nb_nodes)
+{
         return (x+y) % nb_nodes;
 }
 
@@ -70,28 +74,32 @@ static void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblo
 {
 	struct timeval start;
 	struct timeval end;
-        starpu_data_handle **data_handles;
+        starpu_data_handle_t **data_handles;
         int x, y;
 
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-        data_handles = malloc(nblocks*sizeof(starpu_data_handle *));
-        for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle));
+        data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
+        for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	gettimeofday(&start, NULL);
 
-        for(x = 0; x < nblocks ;  x++) {
-                for (y = 0; y < nblocks; y++) {
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
                         int mpi_rank = my_distrib(x, y, nodes);
-                        if (mpi_rank == rank) {
+                        if (mpi_rank == rank)
+			{
                                 //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
                                 starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)matA[x][y],
                                                             ld, size/nblocks, size/nblocks, sizeof(float));
                         }
 			/* TODO: make better test to only registering what is needed */
-                        else {
+                        else
+			{
                                 /* I don't own that index, but will need it for my computations */
                                 //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
@@ -144,8 +152,10 @@ static void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblo
 
         starpu_task_wait_for_all();
 
-        for(x = 0; x < nblocks ;  x++) {
-                for (y = 0; y < nblocks; y++) {
+        for(x = 0; x < nblocks ;  x++)
+	{
+                for (y = 0; y < nblocks; y++)
+		{
                         if (data_handles[x][y])
                                 starpu_data_unregister(data_handles[x][y]);
                 }
@@ -197,7 +207,8 @@ int main(int argc, char **argv)
                 for(y=0 ; y<nblocks ; y++)
 		{
                         int mpi_rank = my_distrib(x, y, nodes);
-                        if (mpi_rank == rank) {
+                        if (mpi_rank == rank)
+			{
 				starpu_malloc((void **)&bmat[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
 				for (i = 0; i < BLOCKSIZE; i++)
 				{
@@ -220,7 +231,8 @@ int main(int argc, char **argv)
                 for(y=0 ; y<nblocks ; y++)
 		{
                         int mpi_rank = my_distrib(x, y, nodes);
-                        if (mpi_rank == rank) {
+                        if (mpi_rank == rank)
+			{
 				starpu_free((void *)bmat[x][y]);
 			}
 		}

+ 21 - 17
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -29,7 +29,7 @@
 #endif
 
 /*
- *   U22 
+ *   U22
  */
 
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
@@ -51,15 +51,16 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 	cublasStatus st;
 #endif
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
-			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21, 
+			SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
 				right, ld12, 1.0f, center, ld22);
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-			cublasSgemm('n', 't', dy, dx, dz, 
-					-1.0f, left, ld21, right, ld12, 
+			cublasSgemm('n', 't', dy, dx, dz,
+					-1.0f, left, ld21, right, ld12,
 					 1.0f, center, ld22);
 			st = cublasGetError();
 			STARPU_ASSERT(!st);
@@ -86,7 +87,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
-/* 
+/*
  * U21
  */
 
@@ -105,7 +106,8 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 			STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
 			break;
@@ -131,25 +133,26 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args)
 {
 	chol_common_codelet_update_u21(descr, 1, _args);
 }
-#endif 
+#endif
 
 /*
  *	U11
  */
 
-static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args) 
+static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args)
 {
 //	printf("11\n");
 	float *sub11;
 
-	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 
+	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
 
 	unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
 	unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
 
 	unsigned z;
 
-	switch (s) {
+	switch (s)
+	{
 		case 0:
 
 			/*
@@ -165,10 +168,10 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				sub11[z+z*ld] = lambda11;
 
 				STARPU_ASSERT(lambda11 != 0.0f);
-		
+
 				SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
-		
-				SSYR("L", nx - z - 1, -1.0f, 
+
+				SSYR("L", nx - z - 1, -1.0f,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
@@ -180,7 +183,8 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				int ret;
 				int info;
 				ret = magma_spotrf_gpu('L', nx, sub11, ld, &info);
-				if (ret != MAGMA_SUCCESS) {
+				if (ret != MAGMA_SUCCESS)
+				{
 					fprintf(stderr, "Error in Magma: %d\n", ret);
 					STARPU_ABORT();
 				}
@@ -195,7 +199,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				cudaStreamSynchronize(0);
 
 				STARPU_ASSERT(lambda11 != 0.0f);
-				
+
 				lambda11 = sqrt(lambda11);
 
 				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
@@ -206,7 +210,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
-		
+
 			cudaThreadSynchronize();
 #endif
 			break;

+ 7 - 9
mpi/examples/cholesky/mpi_cholesky_models.c

@@ -18,25 +18,23 @@
 #include "mpi_cholesky_models.h"
 
 /*
- * As a convention, in that file, descr[0] is represented by A,
- * 				  descr[1] is B ...
+ *	Number of flops of Gemm
  */
 
-/*
- *	Number of flops of Gemm 
- */
-
-struct starpu_perfmodel_t chol_model_11 = {
+struct starpu_perfmodel chol_model_11 =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_11"
 };
 
-struct starpu_perfmodel_t chol_model_21 = {
+struct starpu_perfmodel chol_model_21 =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_21"
 };
 
-struct starpu_perfmodel_t chol_model_22 = {
+struct starpu_perfmodel chol_model_22 =
+{
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "chol_model_22"
 };

+ 3 - 3
mpi/examples/cholesky/mpi_cholesky_models.h

@@ -20,8 +20,8 @@
 
 #include <starpu.h>
 
-extern struct starpu_perfmodel_t chol_model_11;
-extern struct starpu_perfmodel_t chol_model_21;
-extern struct starpu_perfmodel_t chol_model_22;
+extern struct starpu_perfmodel chol_model_11;
+extern struct starpu_perfmodel chol_model_21;
+extern struct starpu_perfmodel chol_model_22;
 
 #endif // __DW_CHOLESKY_MODELS_H__

+ 27 - 27
mpi/examples/mpi_lu/plu_example.c

@@ -40,27 +40,27 @@ static unsigned numa = 0;
 static size_t allocated_memory = 0;
 static size_t allocated_memory_extra = 0;
 
-static starpu_data_handle *dataA_handles;
+static starpu_data_handle_t *dataA_handles;
 static TYPE **dataA;
 
 /* In order to implement the distributed LU decomposition, we allocate
  * temporary buffers */
 #ifdef SINGLE_TMP11
-static starpu_data_handle tmp_11_block_handle;
+static starpu_data_handle_t tmp_11_block_handle;
 static TYPE *tmp_11_block;
 #else
-static starpu_data_handle *tmp_11_block_handles;
+static starpu_data_handle_t *tmp_11_block_handles;
 static TYPE **tmp_11_block;
 #endif
 #ifdef SINGLE_TMP1221
-static starpu_data_handle *tmp_12_block_handles;
+static starpu_data_handle_t *tmp_12_block_handles;
 static TYPE **tmp_12_block;
-static starpu_data_handle *tmp_21_block_handles;
+static starpu_data_handle_t *tmp_21_block_handles;
 static TYPE **tmp_21_block;
 #else
-static starpu_data_handle *(tmp_12_block_handles[2]);
+static starpu_data_handle_t *(tmp_12_block_handles[2]);
 static TYPE **(tmp_12_block[2]);
-static starpu_data_handle *(tmp_21_block_handles[2]);
+static starpu_data_handle_t *(tmp_21_block_handles[2]);
 static TYPE **(tmp_21_block[2]);
 #endif
 
@@ -127,34 +127,34 @@ static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nbloc
 }
 
 #ifdef SINGLE_TMP11
-starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void)
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void)
 {
 	return tmp_11_block_handle;
 }
 #else
-starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(unsigned k)
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k)
 {
 	return tmp_11_block_handles[k];
 }
 #endif
 
 #ifdef SINGLE_TMP1221
-starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
 {
 	return tmp_12_block_handles[j];
 }
 
-starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
 {
 	return tmp_21_block_handles[i];
 }
 #else
-starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k)
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k)
 {
 	return tmp_12_block_handles[k%2][j];
 }
 
-starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k)
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k)
 {
 	return tmp_21_block_handles[k%2][i];
 }
@@ -203,9 +203,9 @@ static void init_matrix(int rank)
 #endif
 
 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
-	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle));
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
 	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
-	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle) + sizeof(TYPE *));
+	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 
 	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
 
@@ -216,8 +216,8 @@ static void init_matrix(int rank)
 		for (i = 0; i < nblocks; i++)
 		{
 			TYPE **blockptr = &dataA[j+i*nblocks];
-//			starpu_data_handle *handleptr = &dataA_handles[j+nblocks*i];
-			starpu_data_handle *handleptr = &dataA_handles[j+nblocks*i];
+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
 
 			if (get_block_rank(i, j) == rank)
 			{
@@ -261,9 +261,9 @@ static void init_matrix(int rank)
 	starpu_matrix_data_register(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
 			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 #else
-	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
 	tmp_11_block = calloc(nblocks, sizeof(TYPE *));
-	allocated_memory_extra += nblocks*(sizeof(starpu_data_handle) + sizeof(TYPE *));
+	allocated_memory_extra += nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 
 	for (k = 0; k < nblocks; k++)
 	{
@@ -282,20 +282,20 @@ static void init_matrix(int rank)
 
 	/* tmp buffers 12 and 21 */
 #ifdef SINGLE_TMP1221
-	tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle));
-	tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
+	tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t));
 	tmp_12_block = calloc(nblocks, sizeof(TYPE *));
 	tmp_21_block = calloc(nblocks, sizeof(TYPE *));
 
-	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle) + sizeof(TYPE *));
+	allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 #else
 	for (i = 0; i < 2; i++) {
-		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle));
-		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle));
+		tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
+		tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t));
 		tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *));
 		tmp_21_block[i] = calloc(nblocks, sizeof(TYPE *));
 
-		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle) + sizeof(TYPE *));
+		allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
 	}
 #endif
 	
@@ -365,7 +365,7 @@ int get_block_rank(unsigned i, unsigned j)
 	return (j % q) * p + (i % p);
 }
 
-starpu_data_handle STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
 {
 	return dataA_handles[j+i*nblocks];
 }
@@ -385,7 +385,7 @@ static void display_grid(int rank, unsigned nblocks)
 			for (i = 0; i < nblocks; i++)
 			{
 				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
-				starpu_data_handle handle = STARPU_PLU(get_block_handle)(i, j);
+				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
 
 				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
 			}

+ 41 - 49
mpi/examples/mpi_lu/pxlu.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -90,7 +90,7 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
 /* Send handle to every node appearing in the mask, and unlock tag once the
  * transfers are done. */
-static void send_data_to_mask(starpu_data_handle handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
+static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
 {
 	unsigned cnt = 0;
 
@@ -99,7 +99,7 @@ static void send_data_to_mask(starpu_data_handle handle, int *rank_mask, int mpi
 	int rank_array[world_size];
 	MPI_Comm comm_array[world_size];
 	int mpi_tag_array[world_size];
-	starpu_data_handle handle_array[world_size];
+	starpu_data_handle_t handle_array[world_size];
 
 	unsigned r;
 	for (r = 0; r < world_size; r++)
@@ -132,7 +132,7 @@ static void send_data_to_mask(starpu_data_handle handle, int *rank_mask, int mpi
 struct recv_when_done_callback_arg {
 	int source;
 	int mpi_tag;
-	starpu_data_handle handle;
+	starpu_data_handle_t handle;
 	starpu_tag_t unlocked_tag;
 };
 
@@ -148,7 +148,7 @@ static void callback_receive_when_done(void *_arg)
 
 static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
 				int source, int mpi_tag,
-				starpu_data_handle handle,
+				starpu_data_handle_t handle,
 				starpu_tag_t partial_tag,
 				starpu_tag_t unlocked_tag)
 {
@@ -206,9 +206,9 @@ static void create_task_11_recv(unsigned k)
 	
 	int source = get_block_rank(k, k);
 #ifdef SINGLE_TMP11
-	starpu_data_handle block_handle = STARPU_PLU(get_tmp_11_block_handle)();
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)();
 #else
-	starpu_data_handle block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
 #endif
 	int mpi_tag = MPI_TAG11(k);
 	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
@@ -250,7 +250,7 @@ static void callback_task_11_real(void *_arg)
 	rank_mask[rank] = 0;
 
 	/* Send the block to those nodes */
-	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(k, k);
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, k);
 	starpu_tag_t tag = TAG11_SAVE(k);
 	int mpi_tag = MPI_TAG11(k);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
@@ -267,8 +267,7 @@ static void create_task_11_real(unsigned k)
 	task->cl_arg = create_debug_info(k, k, k);
 
 	/* which sub-data is manipulated ? */
-	task->buffers[0].handle = STARPU_PLU(get_block_handle)(k, k);
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = STARPU_PLU(get_block_handle)(k, k);
 
 	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
 		arg->k = k;
@@ -357,9 +356,9 @@ static void create_task_12_recv(unsigned k, unsigned j)
 	
 	int source = get_block_rank(k, j);
 #ifdef SINGLE_TMP1221
-	starpu_data_handle block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j);
 #else
-	starpu_data_handle block_handle = STARPU_PLU(get_tmp_12_block_handle)(j,k);
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j,k);
 #endif
 	int mpi_tag = MPI_TAG12(k, j);
 	starpu_tag_t partial_tag = TAG12_SAVE_PARTIAL(k, j);
@@ -394,7 +393,7 @@ static void callback_task_12_real(void *_arg)
 	rank_mask[rank] = 0;
 
 	/* Send the block to those nodes */
-	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(k, j);
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, j);
 	starpu_tag_t tag = TAG12_SAVE(k, j);
 	int mpi_tag = MPI_TAG12(k, j);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
@@ -417,7 +416,7 @@ static void create_task_12_real(unsigned k, unsigned j)
 	starpu_tag_t tag_11_dep; 
 
 	/* which sub-data is manipulated ? */
-	starpu_data_handle diag_block;
+	starpu_data_handle_t diag_block;
 	if (diag_block_is_local)
 	{
 		diag_block = STARPU_PLU(get_block_handle)(k, k);
@@ -433,15 +432,13 @@ static void create_task_12_real(unsigned k, unsigned j)
 		tag_11_dep = TAG11_SAVE(k);
 	}
 
-	task->buffers[0].handle = diag_block; 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = STARPU_PLU(get_block_handle)(k, j); 
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = diag_block; 
+	task->handles[1] = STARPU_PLU(get_block_handle)(k, j); 
 
 	STARPU_ASSERT(get_block_rank(k, j) == rank);
 
-	STARPU_ASSERT(task->buffers[0].handle != STARPU_POISON_PTR);
-	STARPU_ASSERT(task->buffers[1].handle != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
 
 	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
 		arg->j = j;
@@ -529,9 +526,9 @@ static void create_task_21_recv(unsigned k, unsigned i)
 
 	int source = get_block_rank(i, k);
 #ifdef SINGLE_TMP1221
-	starpu_data_handle block_handle = STARPU_PLU(get_tmp_21_block_handle)(i);
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i);
 #else
-	starpu_data_handle block_handle = STARPU_PLU(get_tmp_21_block_handle)(i, k);
+	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i, k);
 #endif
 	int mpi_tag = MPI_TAG21(k, i);
 	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
@@ -567,7 +564,7 @@ static void callback_task_21_real(void *_arg)
 	rank_mask[rank] = 0;
 
 	/* Send the block to those nodes */
-	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(i, k);
+	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(i, k);
 	starpu_tag_t tag = TAG21_SAVE(k, i);
 	int mpi_tag = MPI_TAG21(k, i);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
@@ -590,7 +587,7 @@ static void create_task_21_real(unsigned k, unsigned i)
 	starpu_tag_t tag_11_dep; 
 	
 	/* which sub-data is manipulated ? */
-	starpu_data_handle diag_block;
+	starpu_data_handle_t diag_block;
 	if (diag_block_is_local)
 	{
 		diag_block = STARPU_PLU(get_block_handle)(k, k);
@@ -606,13 +603,11 @@ static void create_task_21_real(unsigned k, unsigned i)
 		tag_11_dep = TAG11_SAVE(k);
 	}
 
-	task->buffers[0].handle = diag_block; 
-	task->buffers[0].mode = STARPU_R;
-	task->buffers[1].handle = STARPU_PLU(get_block_handle)(i, k);
-	task->buffers[1].mode = STARPU_RW;
+	task->handles[0] = diag_block; 
+	task->handles[1] = STARPU_PLU(get_block_handle)(i, k);
 
-	STARPU_ASSERT(task->buffers[0].handle != STARPU_POISON_PTR);
-	STARPU_ASSERT(task->buffers[1].handle != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
 
 	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
 		arg->i = i;
@@ -685,7 +680,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 	unsigned block21_is_local = (get_block_rank(i, k) == rank);
 	starpu_tag_t tag_21_dep;
 
-	starpu_data_handle block21;
+	starpu_data_handle_t block21;
 	if (block21_is_local)
 	{
 		block21 = STARPU_PLU(get_block_handle)(i, k);
@@ -705,7 +700,7 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 	unsigned block12_is_local = (get_block_rank(k, j) == rank);
 	starpu_tag_t tag_12_dep;
 
-	starpu_data_handle block12;
+	starpu_data_handle_t block12;
 	if (block12_is_local)
 	{
 	//	block12 = STARPU_PLU(get_block_handle)(j, k);
@@ -725,21 +720,18 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
 
 #warning temporary fix :/
-	//task->buffers[0].handle = block21;
-	task->buffers[0].handle = block12;
-	task->buffers[0].mode = STARPU_R;
+	//task->handles[0] = block21;
+	task->handles[0] = block12;
 
-	//task->buffers[1].handle = block12;
-	task->buffers[1].handle = block21;
-	task->buffers[1].mode = STARPU_R;
+	//task->handles[1] = block12;
+	task->handles[1] = block21;
 
 	/* produced by TAG22(k-1, i, j) */
-	task->buffers[2].handle = STARPU_PLU(get_block_handle)(i, j);
-	task->buffers[2].mode = STARPU_RW;
+	task->handles[2] = STARPU_PLU(get_block_handle)(i, j);
 
-	STARPU_ASSERT(task->buffers[0].handle != STARPU_POISON_PTR);
-	STARPU_ASSERT(task->buffers[1].handle != STARPU_POISON_PTR);
-	STARPU_ASSERT(task->buffers[2].handle != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[0] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[1] != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->handles[2] != STARPU_POISON_PTR);
 
 	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
 		task->priority = STARPU_MAX_PRIO;
@@ -768,7 +760,7 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 //	}
 }
 
-static void wait_tag_and_fetch_handle(starpu_tag_t tag, starpu_data_handle handle)
+static void wait_tag_and_fetch_handle(starpu_tag_t tag, starpu_data_handle_t handle)
 {
 	STARPU_ASSERT(handle != STARPU_POISON_PTR);
 
@@ -788,7 +780,7 @@ static void wait_termination(void)
 		/* Wait task 11k if needed */
 		if (get_block_rank(k, k) == rank)
 		{
-			starpu_data_handle diag_block = STARPU_PLU(get_block_handle)(k, k);
+			starpu_data_handle_t diag_block = STARPU_PLU(get_block_handle)(k, k);
 			wait_tag_and_fetch_handle(TAG11_SAVE(k), diag_block);
 		}
 		
@@ -798,8 +790,8 @@ static void wait_termination(void)
 			/* Wait task 21ki if needed */
 			if (get_block_rank(i, k) == rank)
 			{
-				starpu_data_handle block21 = STARPU_PLU(get_block_handle)(i, k);
-				//starpu_data_handle block21 = STARPU_PLU(get_block_handle)(k, i);
+				starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(i, k);
+				//starpu_data_handle_t block21 = STARPU_PLU(get_block_handle)(k, i);
 				//fprintf(stderr, "BLOCK21 i %d k %d -> handle %p\n", i, k, block21);
 				wait_tag_and_fetch_handle(TAG21_SAVE(k, i), block21);
 			}
@@ -810,8 +802,8 @@ static void wait_termination(void)
 			/* Wait task 12kj if needed */
 			if (get_block_rank(k, j) == rank)
 			{
-				//starpu_data_handle block12 = STARPU_PLU(get_block_handle)(j, k);
-				starpu_data_handle block12 = STARPU_PLU(get_block_handle)(k, j);
+				//starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(j, k);
+				starpu_data_handle_t block12 = STARPU_PLU(get_block_handle)(k, j);
 				//fprintf(stderr, "BLOCK12 j %d k %d -> handle %p\n", j, k, block12);
 				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
 			}

+ 7 - 7
mpi/examples/mpi_lu/pxlu.h

@@ -48,19 +48,19 @@ unsigned STARPU_PLU(display_flag)(void);
 
 void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
 void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
-starpu_data_handle STARPU_PLU(get_block_handle)(unsigned i, unsigned j);
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j);
 TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j);
 #ifdef SINGLE_TMP11
-starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void);
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(void);
 #else
-starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(unsigned k);
+starpu_data_handle_t STARPU_PLU(get_tmp_11_block_handle)(unsigned k);
 #endif
 #ifdef SINGLE_TMP1221
-starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
-starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
 #else
-starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k);
-starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k);
+starpu_data_handle_t STARPU_PLU(get_tmp_12_block_handle)(unsigned j, unsigned k);
+starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k);
 #endif
 
 void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize);

+ 21 - 17
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -96,7 +96,7 @@ static void STARPU_PLU(cublas_u22)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
-static struct starpu_perfmodel_t STARPU_PLU(model_22) = {
+static struct starpu_perfmodel STARPU_PLU(model_22) = {
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_22_atlas)
@@ -107,13 +107,14 @@ static struct starpu_perfmodel_t STARPU_PLU(model_22) = {
 #endif
 };
 
-starpu_codelet STARPU_PLU(cl22) = {
+struct starpu_codelet STARPU_PLU(cl22) = {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_PLU(cpu_u22),
+	.cpu_funcs = {STARPU_PLU(cpu_u22), NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_PLU(cublas_u22),
+	.cuda_funcs = {STARPU_PLU(cublas_u22), NULL},
 #endif
 	.nbuffers = 3,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &STARPU_PLU(model_22)
 };
 
@@ -203,7 +204,7 @@ static void STARPU_PLU(cublas_u12)(void *descr[], void *_args)
 }
 #endif // STARPU_USE_CUDA
 
-static struct starpu_perfmodel_t STARPU_PLU(model_12) = {
+static struct starpu_perfmodel STARPU_PLU(model_12) = {
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_12_atlas)
@@ -214,13 +215,14 @@ static struct starpu_perfmodel_t STARPU_PLU(model_12) = {
 #endif
 };
 
-starpu_codelet STARPU_PLU(cl12) = {
+struct starpu_codelet STARPU_PLU(cl12) = {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_PLU(cpu_u12),
+	.cpu_funcs = {STARPU_PLU(cpu_u12), NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_PLU(cublas_u12),
+	.cuda_funcs = {STARPU_PLU(cublas_u12), NULL},
 #endif
 	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
 	.model = &STARPU_PLU(model_12)
 };
 
@@ -311,7 +313,7 @@ static void STARPU_PLU(cublas_u21)(void *descr[], void *_args)
 }
 #endif 
 
-static struct starpu_perfmodel_t STARPU_PLU(model_21) = {
+static struct starpu_perfmodel STARPU_PLU(model_21) = {
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_21_atlas)
@@ -322,13 +324,14 @@ static struct starpu_perfmodel_t STARPU_PLU(model_21) = {
 #endif
 };
 
-starpu_codelet STARPU_PLU(cl21) = {
+struct starpu_codelet STARPU_PLU(cl21) = {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_PLU(cpu_u21),
+	.cpu_funcs = {STARPU_PLU(cpu_u21), NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_PLU(cublas_u21),
+	.cuda_funcs = {STARPU_PLU(cublas_u21), NULL},
 #endif
 	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
 	.model = &STARPU_PLU(model_21)
 };
 
@@ -416,7 +419,7 @@ static void STARPU_PLU(cublas_u11)(void *descr[], void *_args)
 }
 #endif// STARPU_USE_CUDA
 
-static struct starpu_perfmodel_t STARPU_PLU(model_11) = {
+static struct starpu_perfmodel STARPU_PLU(model_11) = {
 	.type = STARPU_HISTORY_BASED,
 #ifdef STARPU_ATLAS
 	.symbol = STARPU_PLU_STR(lu_model_11_atlas)
@@ -427,13 +430,14 @@ static struct starpu_perfmodel_t STARPU_PLU(model_11) = {
 #endif
 };
 
-starpu_codelet STARPU_PLU(cl11) = {
+struct starpu_codelet STARPU_PLU(cl11) = {
 	.where = STARPU_CPU|STARPU_CUDA,
-	.cpu_func = STARPU_PLU(cpu_u11),
+	.cpu_funcs = {STARPU_PLU(cpu_u11), NULL},
 #ifdef STARPU_USE_CUDA
-	.cuda_func = STARPU_PLU(cublas_u11),
+	.cuda_funcs = {STARPU_PLU(cublas_u11), NULL},
 #endif
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.model = &STARPU_PLU(model_11)
 };
 

+ 4 - 4
mpi/examples/mpi_lu/pxlu_kernels.h

@@ -24,9 +24,9 @@
 #define xstr(s)        str(s)
 #define STARPU_PLU_STR(name)  xstr(STARPU_PLU(name))
 
-starpu_codelet STARPU_PLU(cl11);
-starpu_codelet STARPU_PLU(cl12);
-starpu_codelet STARPU_PLU(cl21);
-starpu_codelet STARPU_PLU(cl22);
+struct starpu_codelet STARPU_PLU(cl11);
+struct starpu_codelet STARPU_PLU(cl12);
+struct starpu_codelet STARPU_PLU(cl21);
+struct starpu_codelet STARPU_PLU(cl22);
 
 #endif // __PXLU_KERNELS_H__

+ 155 - 0
mpi/examples/reduction/mpi_reduction.c

@@ -0,0 +1,155 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+
+extern void init_cpu_func(void *descr[], void *cl_arg);
+extern void redux_cpu_func(void *descr[], void *cl_arg);
+extern void dot_cpu_func(void *descr[], void *cl_arg);
+
+/*
+ * Codelet passed to starpu_data_set_reduction_methods() in main() as the
+ * initialisation method of the reduction: it runs init_cpu_func on a single
+ * buffer.  The buffer is only written (it is reset to the neutral element),
+ * hence the STARPU_W access mode, declared in the codelet like the other
+ * codelets of this example.
+ */
+static struct starpu_codelet init_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {init_cpu_func, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_W},
+	.name = "init_codelet"
+};
+
+/*
+ * Codelet passed to starpu_data_set_reduction_methods() in main() as the
+ * reduction method: it runs redux_cpu_func on two buffers.  The first buffer
+ * is read and updated with the combined value, the second one is only read,
+ * hence the STARPU_RW / STARPU_R access modes.
+ */
+static struct starpu_codelet redux_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {redux_cpu_func, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R},
+	.name = "redux_codelet"
+};
+
+/*
+ * Codelet submitted once per vector chunk by main(): it runs dot_cpu_func on
+ * two buffers, the chunk (read-only) and the accumulator, which is accessed
+ * in reduction mode (STARPU_REDUX).
+ */
+static struct starpu_codelet dot_codelet =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {dot_cpu_func, NULL},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_REDUX},
+	.name = "dot_codelet"
+};
+
+/*
+ * Returns the rank of the MPI node that owns chunk number x, chunks being
+ * assigned to the nb_nodes nodes in a round-robin fashion.
+ * nb_nodes must not be zero (it is used as a modulo divisor).
+ */
+int my_distrib(int x, int nb_nodes)
+{
+	return x % nb_nodes;
+}
+
+/*
+ * Distributed reduction example.
+ *
+ * A vector of size*8000 long ints holding 1, 2, ..., nb_elements is split
+ * into chunks of `step' elements which are assigned to the MPI nodes by
+ * my_distrib().  One dot_codelet task is submitted per chunk; each of them
+ * accumulates its chunk into dot_handle, which is accessed in STARPU_REDUX
+ * mode and owned by rank 0.  Rank 0 finally compares the reduced value with
+ * the closed-form sum and prints "Success" or "Error".
+ *
+ * argc and argv are unused.  Always returns 0.
+ */
+int main(int argc, char **argv)
+{
+	int my_rank, size, x, y;
+	long int *vector;
+	/* dot and sum are only meaningful on rank 0; they are initialised
+	 * everywhere so that no rank ever holds an indeterminate value. */
+	long int dot = 0, sum = 0;
+	starpu_data_handle_t *handles;
+	starpu_data_handle_t dot_handle;
+
+	int nb_elements, step;
+
+	starpu_init(NULL);
+	starpu_mpi_initialize_extended(&my_rank, &size);
+
+	nb_elements = size*8000;
+	step = 4;
+
+	/* Only the chunks owned by this node are filled in: element i holds i+1 */
+	vector = malloc(nb_elements*sizeof(vector[0]));
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			for(y=0 ; y<step ; y++)
+			{
+				vector[x+y] = x+y+1;
+			}
+		}
+	}
+
+	if (my_rank == 0)
+	{
+		dot = 14;
+		/* Expected result: 1 + 2 + ... + nb_elements, plus the initial
+		 * value of dot.  The product is computed in long int:
+		 * nb_elements * (nb_elements + 1) does not fit in a 32-bit int
+		 * as soon as size >= 6. */
+		sum = ((long int) nb_elements * (nb_elements + 1)) / 2;
+		sum += dot;
+		starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(dot));
+	}
+	else
+	{
+		/* Not the owner of the accumulator: register it without a local buffer */
+		starpu_variable_data_register(&dot_handle, -1, (uintptr_t)NULL, sizeof(dot));
+	}
+
+	/* One handle per chunk, stored at the index of the first element of
+	 * the chunk; the other entries of the array are never used. */
+	handles = malloc(nb_elements*sizeof(handles[0]));
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		int mpi_rank = my_distrib(x/step, size);
+		if (mpi_rank == my_rank)
+		{
+			/* Owning data */
+			starpu_vector_data_register(&handles[x], 0, (uintptr_t)&(vector[x]), step, sizeof(vector[0]));
+		}
+		else
+		{
+			starpu_vector_data_register(&handles[x], -1, (uintptr_t)NULL, step, sizeof(vector[0]));
+		}
+		if (handles[x])
+		{
+			starpu_data_set_rank(handles[x], mpi_rank);
+			starpu_data_set_tag(handles[x], x);
+		}
+	}
+
+	starpu_data_set_rank(dot_handle, 0);
+	/* The chunk tags are in [0, nb_elements), so this one cannot clash */
+	starpu_data_set_tag(dot_handle, nb_elements+1);
+	starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
+
+	for (x = 0; x < nb_elements; x+=step)
+	{
+		starpu_mpi_insert_task(MPI_COMM_WORLD,
+				       &dot_codelet,
+				       STARPU_R, handles[x],
+				       STARPU_REDUX, dot_handle,
+				       0);
+	}
+	starpu_mpi_redux_data(MPI_COMM_WORLD, dot_handle);
+
+	fprintf(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+
+	for(x = 0; x < nb_elements; x+=step)
+	{
+		if (handles[x]) starpu_data_unregister(handles[x]);
+	}
+	if (dot_handle)
+	{
+		starpu_data_unregister(dot_handle);
+	}
+	free(vector);
+	free(handles);
+
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	if (my_rank == 0)
+	{
+		fprintf(stderr, "[%d] sum=%ld\n", my_rank, sum);
+		fprintf(stderr, "[%d] dot=%ld\n", my_rank, dot);
+		fprintf(stderr, "%s when computing reduction\n", (sum == dot) ? "Success" : "Error");
+	}
+
+	return 0;
+}
+

+ 66 - 0
mpi/examples/reduction/mpi_reduction_kernels.c

@@ -0,0 +1,66 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <mpi.h>
+
+/*
+ *	Debugging helper: prints a printf-style message on stderr, prefixed
+ *	with the rank of the calling process in MPI_COMM_WORLD and the name of
+ *	the enclosing function, then flushes stderr.
+ *	The named variadic parameter (args ...) and the ##args pasting are GNU
+ *	extensions of the C preprocessor.
+ */
+#define _DISPLAY(fmt, args ...) do { \
+		int _display_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_display_rank);	\
+		fprintf(stderr, "[%d][%s] " fmt , _display_rank, __func__ ,##args); 	\
+		fflush(stderr); } while(0)
+
+/*
+ *	Codelet to create a neutral element
+ *
+ *	descr[0] is a variable accessed as a long int; it is reset to 0, the
+ *	neutral element of the addition performed by redux_cpu_func.
+ *	cl_arg is not used.
+ */
+void init_cpu_func(void *descr[], void *cl_arg)
+{
+	/* The cast has to match the pointee type: long int, not int */
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	*dot = 0;
+	_DISPLAY("Init dot\n");
+}
+
+/*
+ *	Codelet to perform the reduction of two elements
+ *
+ *	descr[0] and descr[1] are two variables accessed as long int; the
+ *	value of descr[1] is added into descr[0], descr[1] is left untouched.
+ *	cl_arg is not used.
+ */
+void redux_cpu_func(void *descr[], void *cl_arg)
+{
+	/* The casts have to match the pointee type: long int, not int */
+	long int *dota = (long int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	long int *dotb = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+
+	*dota = *dota + *dotb;
+	/* *dota already holds the sum, so the previous value is recomputed
+	 * by subtracting *dotb */
+	_DISPLAY("Calling redux %ld=%ld+%ld\n", *dota, *dota-*dotb, *dotb);
+}
+
+/*
+ *	Dot product codelet
+ *
+ *	descr[0] is a vector whose elements are accessed as long int, descr[1]
+ *	is a variable accessed as long int: every element of the vector is
+ *	added to the variable. cl_arg is not used.
+ */
+void dot_cpu_func(void *descr[], void *cl_arg)
+{
+	/* The casts have to match the pointee type: long int, not int */
+	long int *local_x = (long int *)STARPU_VECTOR_GET_PTR(descr[0]);
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	long int *dot = (long int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	unsigned i;
+
+//	_DISPLAY("Before dot=%ld (adding %d elements...)\n", *dot, n);
+	for (i = 0; i < n; i++)
+	{
+//		_DISPLAY("Adding %ld\n", local_x[i]);
+		*dot += local_x[i];
+	}
+//	_DISPLAY("After dot=%ld\n", *dot);
+}
+

+ 11 - 9
mpi/examples/scatter_gather/mpi_scatter_gather.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,7 +17,8 @@
 #include <starpu_mpi.h>
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes) {
+int my_distrib(int x, int y, int nb_nodes)
+{
         return (x+y) % nb_nodes;
 }
 
@@ -31,7 +32,7 @@ void cpu_codelet(void *descr[], void *_args)
 	float factor;
 
 	block = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
-        starpu_unpack_cl_args(_args, &rank);
+        starpu_codelet_unpack_args(_args, &rank);
 	factor = block[0];
 
 	//fprintf(stderr,"rank %d factor %f\n", rank, factor);
@@ -45,18 +46,19 @@ void cpu_codelet(void *descr[], void *_args)
 	}
 }
 
-static starpu_codelet cl =
+static struct starpu_codelet cl =
 {
 	.where = STARPU_CPU,
-	.cpu_func = cpu_codelet,
-	.nbuffers = 1
+	.cpu_funcs = {cpu_codelet, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 
 int main(int argc, char **argv)
 {
         int rank, nodes;
-	float ***bmat;
-        starpu_data_handle *data_handles;
+	float ***bmat = NULL;
+        starpu_data_handle_t *data_handles;
 
 	unsigned i,j,x,y;
 
@@ -117,7 +119,7 @@ int main(int argc, char **argv)
 #endif
 
 	/* Allocate data handles and register data to StarPU */
-        data_handles = malloc(nblocks*nblocks*sizeof(starpu_data_handle *));
+        data_handles = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t *));
         for(x = 0; x < nblocks ;  x++)
 	{
                 for (y = 0; y < nblocks; y++)

+ 53 - 35
mpi/examples/stencil/stencil5.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,37 +29,41 @@ void stencil5_cpu(void *descr[], __attribute__ ((unused)) void *_args)
         *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
 }
 
-starpu_codelet stencil5_cl = {
+struct starpu_codelet stencil5_cl =
+{
 	.where = STARPU_CPU,
-	.cpu_func = stencil5_cpu,
-        .nbuffers = 5
+	.cpu_funcs = {stencil5_cpu, NULL},
+        .nbuffers = 5,
+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
 };
 
-#define NITER_DEF 2000
-#define X         15
-#define Y         50
+#define NITER_DEF 500
+#define X         20
+#define Y         20
 
 int display = 0;
 int niter = NITER_DEF;
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes) {
-	/* Cyclic distrib */
+int my_distrib(int x, int y, int nb_nodes)
+{
+	/* Block distrib */
 	return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
-        //	/* Linear distrib */
-        //	return x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * X;
 }
 
 
 static void parse_args(int argc, char **argv)
 {
 	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-iter") == 0) {
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-iter") == 0)
+		{
 			char *argptr;
 			niter = strtol(argv[++i], &argptr, 10);
 		}
-		if (strcmp(argv[i], "-display") == 0) {
+		if (strcmp(argv[i], "-display") == 0)
+		{
 			display = 1;
 		}
 	}
@@ -67,37 +71,45 @@ static void parse_args(int argc, char **argv)
 
 int main(int argc, char **argv)
 {
-        int rank, size, x, y, loop;
+        int my_rank, size, x, y, loop;
         int value=0, mean=0;
         unsigned matrix[X][Y];
-        starpu_data_handle data_handles[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
 
 	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+	starpu_mpi_initialize_extended(&my_rank, &size);
         parse_args(argc, argv);
 
-        for(x = 0; x < X; x++) {
-                for (y = 0; y < Y; y++) {
-                        matrix[x][y] = (rank+1)*10 + value;
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
+                        matrix[x][y] = (my_rank+1)*10 + value;
                         value++;
                         mean += matrix[x][y];
                 }
         }
         mean /= value;
 
-        for(x = 0; x < X; x++) {
-                for (y = 0; y < Y; y++) {
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
                         int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank) {
-                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                        if (mpi_rank == my_rank)
+			{
+                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
                         }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1) {
+			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
+			      || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
+			{
                                 /* I don't own that index, but will need it for my computations */
-                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
                         }
-                        else {
+                        else
+			{
                                 /* I know it's useless to allocate anything for this */
                                 data_handles[x][y] = NULL;
                         }
@@ -109,9 +121,12 @@ int main(int argc, char **argv)
                 }
         }
 
-        for(loop=0 ; loop<niter; loop++) {
-                for (x = 1; x < X-1; x++) {
-                        for (y = 1; y < Y-1; y++) {
+        for(loop=0 ; loop<niter; loop++)
+	{
+                for (x = 1; x < X-1; x++)
+		{
+                        for (y = 1; y < Y-1; y++)
+			{
                                 starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
                                                        STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
                                                        STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
@@ -125,11 +140,14 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-        if (display) {
-                fprintf(stdout, "[%d] mean=%d\n", rank, mean);
-                for(x = 0; x < X; x++) {
-                        fprintf(stdout, "[%d] ", rank);
-                        for (y = 0; y < Y; y++) {
+        if (display)
+	{
+                fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
+                for(x = 0; x < X; x++)
+		{
+                        fprintf(stdout, "[%d] ", my_rank);
+                        for (y = 0; y < Y; y++)
+			{
                                 fprintf(stdout, "%3d ", matrix[x][y]);
                         }
                         fprintf(stdout, "\n");

+ 29 - 0
mpi/libstarpumpi.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ -DSTARPU_USE_DEPRECATED_API
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: libstarpu
+Requires.private:

+ 125 - 125
mpi/starpu_mpi.c

@@ -27,13 +27,13 @@
 //#define USE_STARPU_ACTIVITY	1
 
 static void submit_mpi_req(void *arg);
-static void handle_request_termination(struct starpu_mpi_req_s *req);
+static void handle_request_termination(struct _starpu_mpi_req *req);
 
 /* The list of requests that have been newly submitted by the application */
-static starpu_mpi_req_list_t new_requests;
+static struct _starpu_mpi_req_list *new_requests;
 
 /* The list of detached requests that have already been submitted to MPI */
-static starpu_mpi_req_list_t detached_requests;
+static struct _starpu_mpi_req_list *detached_requests;
 static pthread_mutex_t detached_requests_mutex;
 
 static pthread_cond_t cond;
@@ -45,13 +45,13 @@ static int running = 0;
 static pthread_mutex_t mutex_posted_requests;
 static int posted_requests = 0;
 
-#define INC_POSTED_REQUESTS(value) { PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
+#define INC_POSTED_REQUESTS(value) { _STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; _STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 
 /*
  *	Isend
  */
 
-static void starpu_mpi_isend_func(struct starpu_mpi_req_s *req)
+static void starpu_mpi_isend_func(struct _starpu_mpi_req *req)
 {
         _STARPU_MPI_LOG_IN();
 	void *ptr = starpu_mpi_handle_to_ptr(req->data_handle);
@@ -66,18 +66,18 @@ static void starpu_mpi_isend_func(struct starpu_mpi_req_s *req)
 	TRACE_MPI_ISEND(req->srcdst, req->mpi_tag, 0);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
-	PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	req->submitted = 1;
-	PTHREAD_COND_BROADCAST(&req->req_cond);
-	PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
         _STARPU_MPI_LOG_OUT();
 }
 
-static struct starpu_mpi_req_s *_starpu_mpi_isend_common(starpu_data_handle data_handle,
-				int dest, int mpi_tag, MPI_Comm comm,
-				unsigned detached, void (*callback)(void *), void *arg)
+static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle,
+							int dest, int mpi_tag, MPI_Comm comm,
+							unsigned detached, void (*callback)(void *), void *arg)
 {
-	struct starpu_mpi_req_s *req = calloc(1, sizeof(struct starpu_mpi_req_s));
+	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(req);
 
         _STARPU_MPI_LOG_IN();
@@ -87,8 +87,8 @@ static struct starpu_mpi_req_s *_starpu_mpi_isend_common(starpu_data_handle data
 	/* Initialize the request structure */
 	req->submitted = 0;
 	req->completed = 0;
-	PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
-	PTHREAD_COND_INIT(&req->req_cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
 
 	req->request_type = SEND_REQ;
 
@@ -111,12 +111,12 @@ static struct starpu_mpi_req_s *_starpu_mpi_isend_common(starpu_data_handle data
 	return req;
 }
 
-int starpu_mpi_isend(starpu_data_handle data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int mpi_tag, MPI_Comm comm)
 {
         _STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(public_req);
 
-	struct starpu_mpi_req_s *req;
+	struct _starpu_mpi_req *req;
 	req = _starpu_mpi_isend_common(data_handle, dest, mpi_tag, comm, 0, NULL, NULL);
 
 	STARPU_ASSERT(req);
@@ -130,7 +130,7 @@ int starpu_mpi_isend(starpu_data_handle data_handle, starpu_mpi_req *public_req,
  *	Isend (detached)
  */
 
-int starpu_mpi_isend_detached(starpu_data_handle data_handle,
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle,
 				int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
         _STARPU_MPI_LOG_IN();
@@ -144,7 +144,7 @@ int starpu_mpi_isend_detached(starpu_data_handle data_handle,
  *	Irecv
  */
 
-static void starpu_mpi_irecv_func(struct starpu_mpi_req_s *req)
+static void starpu_mpi_irecv_func(struct _starpu_mpi_req *req)
 {
         _STARPU_MPI_LOG_IN();
 	void *ptr = starpu_mpi_handle_to_ptr(req->data_handle);
@@ -158,25 +158,25 @@ static void starpu_mpi_irecv_func(struct starpu_mpi_req_s *req)
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
-	PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	req->submitted = 1;
-	PTHREAD_COND_BROADCAST(&req->req_cond);
-	PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
         _STARPU_MPI_LOG_OUT();
 }
 
-static struct starpu_mpi_req_s *_starpu_mpi_irecv_common(starpu_data_handle data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
+static struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, unsigned detached, void (*callback)(void *), void *arg)
 {
         _STARPU_MPI_LOG_IN();
-	struct starpu_mpi_req_s *req = calloc(1, sizeof(struct starpu_mpi_req_s));
+	struct _starpu_mpi_req *req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(req);
 
         INC_POSTED_REQUESTS(1);
 
 	/* Initialize the request structure */
 	req->submitted = 0;
-	PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
-	PTHREAD_COND_INIT(&req->req_cond, NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&req->req_mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&req->req_cond, NULL);
 
 	req->request_type = RECV_REQ;
 
@@ -200,12 +200,12 @@ static struct starpu_mpi_req_s *_starpu_mpi_irecv_common(starpu_data_handle data
 	return req;
 }
 
-int starpu_mpi_irecv(starpu_data_handle data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
+int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, int mpi_tag, MPI_Comm comm)
 {
         _STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(public_req);
 
-	struct starpu_mpi_req_s *req;
+	struct _starpu_mpi_req *req;
 	req = _starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 0, NULL, NULL);
 
 	STARPU_ASSERT(req);
@@ -219,7 +219,7 @@ int starpu_mpi_irecv(starpu_data_handle data_handle, starpu_mpi_req *public_req,
  *	Irecv (detached)
  */
 
-int starpu_mpi_irecv_detached(starpu_data_handle data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 {
         _STARPU_MPI_LOG_IN();
 	_starpu_mpi_irecv_common(data_handle, source, mpi_tag, comm, 1, callback, arg);
@@ -233,7 +233,7 @@ int starpu_mpi_irecv_detached(starpu_data_handle data_handle, int source, int mp
  *	Recv
  */
 
-int starpu_mpi_recv(starpu_data_handle data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
+int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 {
 	starpu_mpi_req req;
 
@@ -249,7 +249,7 @@ int starpu_mpi_recv(starpu_data_handle data_handle, int source, int mpi_tag, MPI
  *	Send
  */
 
-int starpu_mpi_send(starpu_data_handle data_handle, int dest, int mpi_tag, MPI_Comm comm)
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
 {
 	starpu_mpi_req req;
 	MPI_Status status;
@@ -268,11 +268,11 @@ int starpu_mpi_send(starpu_data_handle data_handle, int dest, int mpi_tag, MPI_C
  *	Wait
  */
 
-static void starpu_mpi_wait_func(struct starpu_mpi_req_s *waiting_req)
+static void starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 {
         _STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are waiting for ? */
-	struct starpu_mpi_req_s *req = waiting_req->other_request;
+	struct _starpu_mpi_req *req = waiting_req->other_request;
 
 	req->ret = MPI_Wait(&req->request, waiting_req->status);
         STARPU_ASSERT(req->ret == MPI_SUCCESS);
@@ -285,22 +285,22 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 {
         _STARPU_MPI_LOG_IN();
 	int ret;
-	struct starpu_mpi_req_s *waiting_req = calloc(1, sizeof(struct starpu_mpi_req_s));
+	struct _starpu_mpi_req *waiting_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(waiting_req);
-	struct starpu_mpi_req_s *req = *public_req;
+	struct _starpu_mpi_req *req = *public_req;
 
         INC_POSTED_REQUESTS(1);
 
 	/* We cannot try to complete a MPI request that was not actually posted
 	 * to MPI yet. */
-	PTHREAD_MUTEX_LOCK(&(req->req_mutex));
+	_STARPU_PTHREAD_MUTEX_LOCK(&(req->req_mutex));
 	while (!(req->submitted))
-		PTHREAD_COND_WAIT(&(req->req_cond), &(req->req_mutex));
-	PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
+		_STARPU_PTHREAD_COND_WAIT(&(req->req_cond), &(req->req_mutex));
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&(req->req_mutex));
 
 	/* Initialize the request structure */
-	PTHREAD_MUTEX_INIT(&(waiting_req->req_mutex), NULL);
-	PTHREAD_COND_INIT(&(waiting_req->req_cond), NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&(waiting_req->req_mutex), NULL);
+	_STARPU_PTHREAD_COND_INIT(&(waiting_req->req_cond), NULL);
 	waiting_req->status = status;
 	waiting_req->other_request = req;
 	waiting_req->func = starpu_mpi_wait_func;
@@ -309,10 +309,10 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 	submit_mpi_req(waiting_req);
 
 	/* We wait for the MPI request to finish */
-	PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	while (!req->completed)
-		PTHREAD_COND_WAIT(&req->req_cond, &req->req_mutex);
-	PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&req->req_cond, &req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 
 	ret = req->ret;
 
@@ -329,11 +329,11 @@ int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
  * 	Test
  */
 
-static void starpu_mpi_test_func(struct starpu_mpi_req_s *testing_req)
+static void starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 {
         _STARPU_MPI_LOG_IN();
 	/* Which is the mpi request we are testing for ? */
-	struct starpu_mpi_req_s *req = testing_req->other_request;
+	struct _starpu_mpi_req *req = testing_req->other_request;
 
         _STARPU_MPI_DEBUG("Test request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
 	req->ret = MPI_Test(&req->request, testing_req->flag, testing_req->status);
@@ -345,10 +345,10 @@ static void starpu_mpi_test_func(struct starpu_mpi_req_s *testing_req)
 		handle_request_termination(req);
 	}
 
-	PTHREAD_MUTEX_LOCK(&testing_req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&testing_req->req_mutex);
 	testing_req->completed = 1;
-	PTHREAD_COND_SIGNAL(&testing_req->req_cond);
-	PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&testing_req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&testing_req->req_mutex);
         _STARPU_MPI_LOG_OUT();
 }
 
@@ -359,23 +359,23 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 
 	STARPU_ASSERT(public_req);
 
-	struct starpu_mpi_req_s *req = *public_req;
+	struct _starpu_mpi_req *req = *public_req;
 
 	STARPU_ASSERT(!req->detached);
 
-	PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	unsigned submitted = req->submitted;
-	PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
 
 	if (submitted)
 	{
-		struct starpu_mpi_req_s *testing_req = calloc(1, sizeof(struct starpu_mpi_req_s));
+		struct _starpu_mpi_req *testing_req = calloc(1, sizeof(struct _starpu_mpi_req));
                 STARPU_ASSERT(testing_req);
-                //		memset(testing_req, 0, sizeof(struct starpu_mpi_req_s));
+                //		memset(testing_req, 0, sizeof(struct _starpu_mpi_req));
 
 		/* Initialize the request structure */
-		PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
-		PTHREAD_COND_INIT(&(testing_req->req_cond), NULL);
+		_STARPU_PTHREAD_MUTEX_INIT(&(testing_req->req_mutex), NULL);
+		_STARPU_PTHREAD_COND_INIT(&(testing_req->req_cond), NULL);
 		testing_req->flag = flag;
 		testing_req->status = status;
 		testing_req->other_request = req;
@@ -387,10 +387,10 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
                 submit_mpi_req(testing_req);
 
 		/* We wait for the test request to finish */
-		PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
+		_STARPU_PTHREAD_MUTEX_LOCK(&(testing_req->req_mutex));
 		while (!(testing_req->completed))
-                        PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
-		PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
+                        _STARPU_PTHREAD_COND_WAIT(&(testing_req->req_cond), &(testing_req->req_mutex));
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&(testing_req->req_mutex));
 
 		ret = testing_req->ret;
 
@@ -415,7 +415,7 @@ int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
  *	Barrier
  */
 
-static void starpu_mpi_barrier_func(struct starpu_mpi_req_s *barrier_req)
+static void starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 {
         _STARPU_MPI_LOG_IN();
 
@@ -430,12 +430,12 @@ int starpu_mpi_barrier(MPI_Comm comm)
 {
         _STARPU_MPI_LOG_IN();
 	int ret;
-	struct starpu_mpi_req_s *barrier_req = calloc(1, sizeof(struct starpu_mpi_req_s));
+	struct _starpu_mpi_req *barrier_req = calloc(1, sizeof(struct _starpu_mpi_req));
 	STARPU_ASSERT(barrier_req);
 
 	/* Initialize the request structure */
-	PTHREAD_MUTEX_INIT(&(barrier_req->req_mutex), NULL);
-	PTHREAD_COND_INIT(&(barrier_req->req_cond), NULL);
+	_STARPU_PTHREAD_MUTEX_INIT(&(barrier_req->req_mutex), NULL);
+	_STARPU_PTHREAD_COND_INIT(&(barrier_req->req_cond), NULL);
 	barrier_req->func = starpu_mpi_barrier_func;
 	barrier_req->request_type = BARRIER_REQ;
 	barrier_req->comm = comm;
@@ -444,10 +444,10 @@ int starpu_mpi_barrier(MPI_Comm comm)
 	submit_mpi_req(barrier_req);
 
 	/* We wait for the MPI request to finish */
-	PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->req_mutex);
 	while (!barrier_req->completed)
-		PTHREAD_COND_WAIT(&barrier_req->req_cond, &barrier_req->req_mutex);
-	PTHREAD_MUTEX_UNLOCK(&barrier_req->req_mutex);
+		_STARPU_PTHREAD_COND_WAIT(&barrier_req->req_cond, &barrier_req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->req_mutex);
 
 	ret = barrier_req->ret;
 
@@ -475,7 +475,7 @@ static char *starpu_mpi_request_type(unsigned request_type)
 }
 #endif
 
-static void handle_request_termination(struct starpu_mpi_req_s *req)
+static void handle_request_termination(struct _starpu_mpi_req *req)
 {
         _STARPU_MPI_LOG_IN();
 
@@ -497,25 +497,25 @@ static void handle_request_termination(struct starpu_mpi_req_s *req)
 
 	/* tell anyone potentiallly waiting on the request that it is
 	 * terminated now */
-	PTHREAD_MUTEX_LOCK(&req->req_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&req->req_mutex);
 	req->completed = 1;
-	PTHREAD_COND_BROADCAST(&req->req_cond);
-	PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
+	_STARPU_PTHREAD_COND_BROADCAST(&req->req_cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&req->req_mutex);
         _STARPU_MPI_LOG_OUT();
 }
 
 static void submit_mpi_req(void *arg)
 {
         _STARPU_MPI_LOG_IN();
-	struct starpu_mpi_req_s *req = arg;
+	struct _starpu_mpi_req *req = arg;
 
         INC_POSTED_REQUESTS(-1);
 
-	PTHREAD_MUTEX_LOCK(&mutex);
-	starpu_mpi_req_list_push_front(new_requests, req);
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	_starpu_mpi_req_list_push_front(new_requests, req);
         _STARPU_MPI_DEBUG("Pushing new request type %d\n", req->request_type);
-	PTHREAD_COND_BROADCAST(&cond);
-	PTHREAD_MUTEX_UNLOCK(&mutex);
+	_STARPU_PTHREAD_COND_BROADCAST(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
         _STARPU_MPI_LOG_OUT();
 }
 
@@ -528,13 +528,13 @@ static unsigned progression_hook_func(void *arg __attribute__((unused)))
 {
 	unsigned may_block = 1;
 
-	PTHREAD_MUTEX_LOCK(&mutex);
-	if (!starpu_mpi_req_list_empty(detached_requests))
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	if (!_starpu_mpi_req_list_empty(detached_requests))
 	{
-		PTHREAD_COND_SIGNAL(&cond);
+		_STARPU_PTHREAD_COND_SIGNAL(&cond);
 		may_block = 0;
 	}
-	PTHREAD_MUTEX_UNLOCK(&mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	return may_block;
 }
@@ -549,17 +549,17 @@ static void test_detached_requests(void)
         _STARPU_MPI_LOG_IN();
 	int flag;
 	MPI_Status status;
-	struct starpu_mpi_req_s *req, *next_req;
+	struct _starpu_mpi_req *req, *next_req;
 
-	PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 
-	for (req = starpu_mpi_req_list_begin(detached_requests);
-		req != starpu_mpi_req_list_end(detached_requests);
+	for (req = _starpu_mpi_req_list_begin(detached_requests);
+		req != _starpu_mpi_req_list_end(detached_requests);
 		req = next_req)
 	{
-		next_req = starpu_mpi_req_list_next(req);
+		next_req = _starpu_mpi_req_list_next(req);
 
-		PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
 
                 //_STARPU_MPI_DEBUG("Test detached request %p - mpitag %d - TYPE %s %d\n", &req->request, req->mpi_tag, (req->request_type == RECV_REQ)?"recv : source":"send : dest", req->srcdst);
 		req->ret = MPI_Test(&req->request, &flag, &status);
@@ -570,10 +570,10 @@ static void test_detached_requests(void)
 			handle_request_termination(req);
 		}
 
-		PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&detached_requests_mutex);
 
 		if (flag)
-			starpu_mpi_req_list_erase(detached_requests, req);
+			_starpu_mpi_req_list_erase(detached_requests, req);
 
 #ifdef STARPU_DEVEL
 #warning TODO fix memleak
@@ -583,11 +583,11 @@ static void test_detached_requests(void)
 		//	free(req);
 	}
 
-	PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&detached_requests_mutex);
         _STARPU_MPI_LOG_OUT();
 }
 
-static void handle_new_request(struct starpu_mpi_req_s *req)
+static void handle_new_request(struct _starpu_mpi_req *req)
 {
         _STARPU_MPI_LOG_IN();
 	STARPU_ASSERT(req);
@@ -598,17 +598,17 @@ static void handle_new_request(struct starpu_mpi_req_s *req)
 
 	if (req->detached)
 	{
-		PTHREAD_MUTEX_LOCK(&mutex);
-		starpu_mpi_req_list_push_front(detached_requests, req);
-		PTHREAD_MUTEX_UNLOCK(&mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		_starpu_mpi_req_list_push_front(detached_requests, req);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 		starpu_wake_all_blocked_workers();
 
 		/* put the submitted request into the list of pending requests
 		 * so that it can be handled by the progression mechanisms */
-		PTHREAD_MUTEX_LOCK(&mutex);
-		PTHREAD_COND_SIGNAL(&cond);
-		PTHREAD_MUTEX_UNLOCK(&mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+		_STARPU_PTHREAD_COND_SIGNAL(&cond);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	}
         _STARPU_MPI_LOG_OUT();
 }
@@ -638,49 +638,49 @@ static void *progress_thread_func(void *arg)
         }
 
 	/* notify the main thread that the progression thread is ready */
-	PTHREAD_MUTEX_LOCK(&mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	running = 1;
-	PTHREAD_COND_SIGNAL(&cond);
-	PTHREAD_MUTEX_UNLOCK(&mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
-	PTHREAD_MUTEX_LOCK(&mutex);
-	while (running || posted_requests || !(starpu_mpi_req_list_empty(new_requests)) || !(starpu_mpi_req_list_empty(detached_requests))) {
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
+	while (running || posted_requests || !(_starpu_mpi_req_list_empty(new_requests)) || !(_starpu_mpi_req_list_empty(detached_requests))) {
 		/* shall we block ? */
-		unsigned block = starpu_mpi_req_list_empty(new_requests);
+		unsigned block = _starpu_mpi_req_list_empty(new_requests);
 
 #ifndef USE_STARPU_ACTIVITY
-		block = block && starpu_mpi_req_list_empty(detached_requests);
+		block = block && _starpu_mpi_req_list_empty(detached_requests);
 #endif
 
 		if (block)
 		{
                         _STARPU_MPI_DEBUG("NO MORE REQUESTS TO HANDLE\n");
-			PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
 		}
 
 		/* test whether there are some terminated "detached request" */
-		PTHREAD_MUTEX_UNLOCK(&mutex);
+		_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		test_detached_requests();
-		PTHREAD_MUTEX_LOCK(&mutex);
+		_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 
 		/* get one request */
-		struct starpu_mpi_req_s *req;
-		while (!starpu_mpi_req_list_empty(new_requests))
+		struct _starpu_mpi_req *req;
+		while (!_starpu_mpi_req_list_empty(new_requests))
 		{
-			req = starpu_mpi_req_list_pop_back(new_requests);
+			req = _starpu_mpi_req_list_pop_back(new_requests);
 
 			/* handling a request is likely to block for a while
 			 * (on a sync_data_with_mem call), we want to let the
 			 * application submit requests in the meantime, so we
 			 * release the lock.  */
-			PTHREAD_MUTEX_UNLOCK(&mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 			handle_new_request(req);
-			PTHREAD_MUTEX_LOCK(&mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 		}
 	}
 
-	STARPU_ASSERT(starpu_mpi_req_list_empty(detached_requests));
-	STARPU_ASSERT(starpu_mpi_req_list_empty(new_requests));
+	STARPU_ASSERT(_starpu_mpi_req_list_empty(detached_requests));
+	STARPU_ASSERT(_starpu_mpi_req_list_empty(new_requests));
         STARPU_ASSERT(posted_requests == 0);
 
         if (initialize_mpi) {
@@ -688,7 +688,7 @@ static void *progress_thread_func(void *arg)
                 MPI_Finalize();
         }
 
-	PTHREAD_MUTEX_UNLOCK(&mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	return NULL;
 }
@@ -735,21 +735,21 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 static
 int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
 {
-	PTHREAD_MUTEX_INIT(&mutex, NULL);
-	PTHREAD_COND_INIT(&cond, NULL);
-	new_requests = starpu_mpi_req_list_new();
+	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
+	_STARPU_PTHREAD_COND_INIT(&cond, NULL);
+	new_requests = _starpu_mpi_req_list_new();
 
-	PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
-	detached_requests = starpu_mpi_req_list_new();
+	_STARPU_PTHREAD_MUTEX_INIT(&detached_requests_mutex, NULL);
+	detached_requests = _starpu_mpi_req_list_new();
 
-        PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
+        _STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
 
 	pthread_create(&progress_thread, NULL, progress_thread_func, (void *)&initialize_mpi);
 
-	PTHREAD_MUTEX_LOCK(&mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	while (!running)
-		PTHREAD_COND_WAIT(&cond, &mutex);
-	PTHREAD_MUTEX_UNLOCK(&mutex);
+		_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
         if (rank && world_size) {
                 _STARPU_DEBUG("Calling MPI_Comm_rank\n");
@@ -788,10 +788,10 @@ int starpu_mpi_shutdown(void)
 	void *value;
 
 	/* kill the progression thread */
-	PTHREAD_MUTEX_LOCK(&mutex);
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	running = 0;
-	PTHREAD_COND_BROADCAST(&cond);
-	PTHREAD_MUTEX_UNLOCK(&mutex);
+	_STARPU_PTHREAD_COND_BROADCAST(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	pthread_join(progress_thread, &value);
 
@@ -800,8 +800,8 @@ int starpu_mpi_shutdown(void)
 #endif
 
 	/* free the request queues */
-	starpu_mpi_req_list_delete(detached_requests);
-	starpu_mpi_req_list_delete(new_requests);
+	_starpu_mpi_req_list_delete(detached_requests);
+	_starpu_mpi_req_list_delete(new_requests);
 
 	return 0;
 }

+ 24 - 19
mpi/starpu_mpi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,14 +21,18 @@
 #include <starpu.h>
 #include <mpi.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef void *starpu_mpi_req;
 
-int starpu_mpi_isend(starpu_data_handle data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
-int starpu_mpi_irecv(starpu_data_handle data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm);
-int starpu_mpi_send(starpu_data_handle data_handle, int dest, int mpi_tag, MPI_Comm comm);
-int starpu_mpi_recv(starpu_data_handle data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
-int starpu_mpi_isend_detached(starpu_data_handle data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
-int starpu_mpi_irecv_detached(starpu_data_handle data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm);
+int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status);
+int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
+int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
 int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
 int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);
 int starpu_mpi_barrier(MPI_Comm comm);
@@ -36,25 +40,26 @@ int starpu_mpi_initialize(void);
 int starpu_mpi_initialize_extended(int *rank, int *world_size);
 int starpu_mpi_shutdown(void);
 
-int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...);
-void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle data_handle, int node);
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
+void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
 
-int starpu_mpi_scatter_detached(starpu_data_handle *data_handles, int count, int root, MPI_Comm comm);
-int starpu_mpi_gather_detached(starpu_data_handle *data_handles, int count, int root, MPI_Comm comm);
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm);
 
 /* Some helper functions */
 
 /* When the transfer is completed, the tag is unlocked */
-int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
-int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag);
 
 /* Asynchronously send an array of buffers, and unlocks the tag once all of
  * them are transmitted. */
-int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
-		starpu_data_handle *data_handle, int *dest, int *mpi_tag,
-		MPI_Comm *comm, starpu_tag_t tag);
-int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size,
-		starpu_data_handle *data_handle, int *source, int *mpi_tag,
-		MPI_Comm *comm, starpu_tag_t tag);
+int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag);
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif // __STARPU_MPI_H__

+ 2 - 2
mpi/starpu_mpi_collective.c

@@ -18,7 +18,7 @@
 #include <starpu.h>
 #include <starpu_mpi.h>
 
-int starpu_mpi_scatter_detached(starpu_data_handle *data_handles, int count, int root, MPI_Comm comm)
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
 {
 	int rank;
 	int x;
@@ -47,7 +47,7 @@ int starpu_mpi_scatter_detached(starpu_data_handle *data_handles, int count, int
 	return 0;
 }
 
-int starpu_mpi_gather_detached(starpu_data_handle *data_handles, int count, int root, MPI_Comm comm)
+int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm)
 {
 	int rank;
 	int x;

+ 16 - 11
mpi/starpu_mpi_datatype.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -22,14 +22,14 @@
  *	a datatype and the datatype itself, so we need to provide both.
  */
 
-typedef int (*handle_to_datatype_func)(starpu_data_handle, MPI_Datatype *);
-typedef void *(*handle_to_ptr_func)(starpu_data_handle);
+typedef int (*handle_to_datatype_func)(starpu_data_handle_t, MPI_Datatype *);
+typedef void *(*handle_to_ptr_func)(starpu_data_handle_t);
 
 /*
  * 	Matrix
  */
 
-static int handle_to_datatype_matrix(starpu_data_handle data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -51,7 +51,7 @@ static int handle_to_datatype_matrix(starpu_data_handle data_handle, MPI_Datatyp
  * 	Block
  */
 
-static int handle_to_datatype_block(starpu_data_handle data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -82,7 +82,7 @@ static int handle_to_datatype_block(starpu_data_handle data_handle, MPI_Datatype
  * 	Vector
  */
 
-static int handle_to_datatype_vector(starpu_data_handle data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -102,7 +102,7 @@ static int handle_to_datatype_vector(starpu_data_handle data_handle, MPI_Datatyp
  * 	Variable
  */
 
-static int handle_to_datatype_variable(starpu_data_handle data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -121,19 +121,24 @@ static int handle_to_datatype_variable(starpu_data_handle data_handle, MPI_Datat
  *	Generic
  */
 
-static handle_to_datatype_func handle_to_datatype_funcs[STARPU_NINTERFACES_ID] = {
+static handle_to_datatype_func handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
+{
 	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
 	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
 	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
 	[STARPU_CSR_INTERFACE_ID]	= NULL,
 	[STARPU_BCSR_INTERFACE_ID]	= NULL,
 	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
+	[STARPU_VOID_INTERFACE_ID]      = NULL,
+	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
 };
 
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle data_handle, MPI_Datatype *datatype)
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
-	unsigned id = starpu_get_handle_interface_id(data_handle);
+	enum starpu_data_interface_id id = starpu_handle_get_interface_id(data_handle);
+
+	STARPU_ASSERT_MSG(id <= STARPU_MULTIFORMAT_INTERFACE_ID, "Unknown data interface");
 
 	handle_to_datatype_func func = handle_to_datatype_funcs[id];
 
@@ -142,7 +147,7 @@ int starpu_mpi_handle_to_datatype(starpu_data_handle data_handle, MPI_Datatype *
 	return func(data_handle, datatype);
 }
 
-void *starpu_mpi_handle_to_ptr(starpu_data_handle data_handle)
+void *starpu_mpi_handle_to_ptr(starpu_data_handle_t data_handle)
 {
 	return (void*) starpu_handle_get_local_ptr(data_handle);
 }

+ 11 - 3
mpi/starpu_mpi_datatype.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,7 +20,15 @@
 
 #include <starpu_mpi.h>
 
-int starpu_mpi_handle_to_datatype(starpu_data_handle data_handle, MPI_Datatype *datatype);
-void *starpu_mpi_handle_to_ptr(starpu_data_handle data_handle);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int starpu_mpi_handle_to_datatype(starpu_data_handle_t data_handle, MPI_Datatype *datatype);
+void *starpu_mpi_handle_to_ptr(starpu_data_handle_t data_handle);
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif // __STARPU_MPI_DATATYPE_H__

+ 4 - 4
mpi/starpu_mpi_helper.c

@@ -26,7 +26,7 @@ static void starpu_mpi_unlock_tag_callback(void *arg)
 	free(tagptr);
 }
 
-int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle data_handle,
+int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle,
 				int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 {
 	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
@@ -37,7 +37,7 @@ int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle data_handle,
 }
 
 
-int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 {
 	starpu_tag_t *tagptr = malloc(sizeof(starpu_tag_t));
 	*tagptr = tag;
@@ -65,7 +65,7 @@ static void starpu_mpi_array_unlock_callback(void *_arg)
 }
 
 int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
-		starpu_data_handle *data_handle, int *dest, int *mpi_tag,
+		starpu_data_handle_t *data_handle, int *dest, int *mpi_tag,
 		MPI_Comm *comm, starpu_tag_t tag)
 {
 	struct arg_array *arg = malloc(sizeof(struct arg_array));
@@ -85,7 +85,7 @@ int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size,
 }
 
 
-int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 {
 	struct arg_array *arg = malloc(sizeof(struct arg_array));
 

+ 150 - 124
mpi/starpu_mpi_insert_task.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,9 +21,10 @@
 #include <starpu.h>
 #include <starpu_data.h>
 #include <common/utils.h>
-#include <common/hash.h>
+#include <starpu_hash.h>
 #include <common/htable32.h>
 #include <util/starpu_insert_task_utils.h>
+#include <datawizard/coherency.h>
 
 //#define STARPU_MPI_VERBOSE	1
 #include <starpu_mpi_private.h>
@@ -31,139 +32,85 @@
 /* Whether we are allowed to keep copies of remote data. Does not work
  * yet: the sender has to know whether the receiver has it, keeping it
  * in an array indexed by node numbers. */
-#define MPI_CACHE
+//#define MPI_CACHE
+#include <starpu_mpi_insert_task_cache.h>
 
-#ifdef MPI_CACHE
-static struct starpu_htbl32_node_s **sent_data = NULL;
-static struct starpu_htbl32_node_s **received_data = NULL;
-
-static void _starpu_mpi_task_init(int nb_nodes)
-{
-        int i;
-
-        _STARPU_MPI_DEBUG("Initialising hash table for cache\n");
-        sent_data = malloc(nb_nodes * sizeof(struct starpu_htbl32_node_s *));
-        for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
-        received_data = malloc(nb_nodes * sizeof(struct starpu_htbl32_node_s *));
-        for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
-}
-
-typedef struct _starpu_mpi_clear_cache_s {
-        starpu_data_handle data;
-        int rank;
-        int mode;
-} _starpu_mpi_clear_cache_t;
-
-#define _STARPU_MPI_CLEAR_SENT_DATA     0
-#define _STARPU_MPI_CLEAR_RECEIVED_DATA 1
-
-void _starpu_mpi_clear_cache_callback(void *callback_arg)
-{
-        _starpu_mpi_clear_cache_t *clear_cache = (_starpu_mpi_clear_cache_t *)callback_arg;
-        uint32_t key = _starpu_crc32_be((uintptr_t)clear_cache->data, 0);
-
-        if (clear_cache->mode == _STARPU_MPI_CLEAR_SENT_DATA) {
-                _STARPU_MPI_DEBUG("Clearing sent cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
-                _starpu_htbl_insert_32(&sent_data[clear_cache->rank], key, NULL);
-        }
-        else if (clear_cache->mode == _STARPU_MPI_CLEAR_RECEIVED_DATA) {
-                _STARPU_MPI_DEBUG("Clearing received cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
-                _starpu_htbl_insert_32(&received_data[clear_cache->rank], key, NULL);
-        }
-
-        free(clear_cache);
-}
-
-void _starpu_mpi_clear_cache_request(starpu_data_handle data_handle, int rank, int mode)
+static void _starpu_mpi_tables_init()
 {
-        struct starpu_task *task = starpu_task_create();
-        task->cl = NULL;
-
-        task->buffers[0].handle = data_handle;
-        task->buffers[0].mode = STARPU_RW;
-
-        _starpu_mpi_clear_cache_t *clear_cache = malloc(sizeof(_starpu_mpi_clear_cache_t));
-        clear_cache->data = data_handle;
-        clear_cache->rank = rank;
-        clear_cache->mode = mode;
-
-        task->callback_func = _starpu_mpi_clear_cache_callback;
-        task->callback_arg = clear_cache;
-        starpu_task_submit(task);
+        if (sent_data == NULL) {
+                int nb_nodes;
+		int i;
+
+                MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
+		_STARPU_MPI_DEBUG("Initialising hash table for cache\n");
+		sent_data = malloc(nb_nodes * sizeof(struct starpu_htbl32_node *));
+		for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
+		received_data = malloc(nb_nodes * sizeof(struct starpu_htbl32_node *));
+		for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
+	}
 }
-#endif
 
-void _starpu_data_deallocate(starpu_data_handle data_handle)
+void _starpu_data_deallocate(starpu_data_handle_t data_handle)
 {
 #ifdef STARPU_DEVEL
 #warning _starpu_data_deallocate not implemented yet
 #endif
 }
 
-int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
+int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 {
         int arg_type;
         va_list varg_list;
-        int me, do_execute;
+        int me, do_execute, xrank, nb_nodes;
+	size_t *size_on_nodes;
 	size_t arg_buffer_size = 0;
 	char *arg_buffer;
-        int dest=0, execute, inconsistent_execute;
+        int dest=0, inconsistent_execute;
 
         _STARPU_MPI_LOG_IN();
 
 	MPI_Comm_rank(comm, &me);
+	MPI_Comm_size(comm, &nb_nodes);
 
-#ifdef MPI_CACHE
-        if (sent_data == NULL) {
-                int size;
-                MPI_Comm_size(comm, &size);
-                _starpu_mpi_task_init(size);
-        }
-#endif
+	size_on_nodes = (size_t *)calloc(1, nb_nodes * sizeof(size_t));
+
+	_starpu_mpi_tables_init();
 
         /* Get the number of buffers and the size of the arguments */
 	va_start(varg_list, codelet);
         arg_buffer_size = _starpu_insert_task_get_arg_size(varg_list);
 
 	va_start(varg_list, codelet);
-	_starpu_pack_cl_args(arg_buffer_size, &arg_buffer, varg_list);
+	_starpu_codelet_pack_args(arg_buffer_size, &arg_buffer, varg_list);
 
-        /* Finds out if the property STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA is specified */
-        execute = -1;
+	/* Find out whether we are to execute the data because we own the data to be written to. */
+        inconsistent_execute = 0;
+        do_execute = -1;
+	xrank = -1;
 	va_start(varg_list, codelet);
 	while ((arg_type = va_arg(varg_list, int)) != 0) {
 		if (arg_type==STARPU_EXECUTE_ON_NODE) {
-                        execute = va_arg(varg_list, int);
+                        xrank = va_arg(varg_list, int);
+			_STARPU_MPI_DEBUG("Executing on node %d\n", xrank);
+			do_execute = 1;
                 }
 		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
-			starpu_data_handle data = va_arg(varg_list, starpu_data_handle);
-                        execute = starpu_data_get_rank(data);
-                }
-		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
-                        va_arg(varg_list, starpu_data_handle);
+			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+                        xrank = starpu_data_get_rank(data);
+			_STARPU_MPI_DEBUG("Executing on data node %d\n", xrank);
+			STARPU_ASSERT(xrank <= nb_nodes);
+			do_execute = 1;
                 }
-		else if (arg_type==STARPU_VALUE) {
-			va_arg(varg_list, void *);
-		}
-		else if (arg_type==STARPU_CALLBACK) {
-			va_arg(varg_list, void (*)(void *));
-		}
-		else if (arg_type==STARPU_CALLBACK_ARG) {
-			va_arg(varg_list, void *);
-		}
-		else if (arg_type==STARPU_PRIORITY) {
-			va_arg(varg_list, int);
-		}
-        }
-	va_end(varg_list);
+		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
+                        starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
+
+                        if (data && arg_type & STARPU_R) {
+				int rank = starpu_data_get_rank(data);
+				struct starpu_data_interface_ops *ops;
+				ops = data->ops;
+				size_on_nodes[rank] += ops->get_size(data);
+			}
 
-	/* Find out whether we are to execute the data because we own the data to be written to. */
-        inconsistent_execute = 0;
-        do_execute = -1;
-	va_start(varg_list, codelet);
-	while ((arg_type = va_arg(varg_list, int)) != 0) {
-		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
-                        starpu_data_handle data = va_arg(varg_list, starpu_data_handle);
                         if (arg_type & STARPU_W) {
                                 if (!data) {
                                         /* We don't have anything allocated for this.
@@ -174,6 +121,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
                                          * safeguard. */
                                         _STARPU_MPI_DEBUG("oh oh\n");
                                         _STARPU_MPI_LOG_OUT();
+					free(size_on_nodes);
                                         return -EINVAL;
                                 }
                                 int mpi_rank = starpu_data_get_rank(data);
@@ -202,10 +150,15 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
                 }
 		else if (arg_type==STARPU_VALUE) {
 			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
 		}
 		else if (arg_type==STARPU_CALLBACK) {
 			va_arg(varg_list, void (*)(void *));
 		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG) {
 			va_arg(varg_list, void *);
 		}
@@ -220,36 +173,56 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
 		}
 	}
 	va_end(varg_list);
-        assert(do_execute != -1);
+
+	if (do_execute == -1) {
+		int i;
+		size_t max_size = 0;
+		for(i=0 ; i<nb_nodes ; i++) {
+			if (size_on_nodes[i] > max_size)
+			{
+				max_size = size_on_nodes[i];
+				xrank = i;
+			}
+		}
+		free(size_on_nodes);
+		if (xrank != -1) {
+			_STARPU_MPI_DEBUG("Node %d is having the most R data\n", xrank);
+			do_execute = 1;
+		}
+	}
+
+	STARPU_ASSERT(do_execute != -1 && "StarPU needs to see a W or a REDUX data which will tell it where to execute the task");
 
         if (inconsistent_execute == 1) {
-                if (execute == -1) {
-                        _STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet\n");
-                        return -EINVAL;
+                if (xrank == -1) {
+                        _STARPU_MPI_DEBUG("Different tasks are owning W data. Needs to specify which one is to execute the codelet, using STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA\n");
+			free(size_on_nodes);
+			return -EINVAL;
                 }
                 else {
-                        do_execute = (me == execute);
-                        dest = execute;
+                        do_execute = (me == xrank);
+                        dest = xrank;
                 }
         }
-        else if (execute != -1) {
-                _STARPU_MPI_DEBUG("Property STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA ignored as W data are all owned by the same task\n");
-        }
+	else if (xrank != -1) {
+		do_execute = (me == xrank);
+		dest = xrank;
+	}
 
         /* Send and receive data as requested */
 	va_start(varg_list, codelet);
 	while ((arg_type = va_arg(varg_list, int)) != 0) {
 		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
-                        starpu_data_handle data = va_arg(varg_list, starpu_data_handle);
+                        starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
                         if (data && arg_type & STARPU_R) {
                                 int mpi_rank = starpu_data_get_rank(data);
 				int mpi_tag = starpu_data_get_tag(data);
-				STARPU_ASSERT(mpi_tag >= 0);
+				STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
                                 /* The task needs to read this data */
                                 if (do_execute && mpi_rank != me && mpi_rank != -1) {
                                         /* I will have to execute but I don't have the data, receive */
 #ifdef MPI_CACHE
-                                        uint32_t key = _starpu_crc32_be((uintptr_t)data, 0);
+                                        uint32_t key = starpu_crc32_be((uintptr_t)data, 0);
                                         void *already_received = _starpu_htbl_search_32(received_data[mpi_rank], key);
                                         if (!already_received) {
                                                 _starpu_htbl_insert_32(&received_data[mpi_rank], key, data);
@@ -267,7 +240,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
                                 if (!do_execute && mpi_rank == me) {
                                         /* Somebody else will execute it, and I have the data, send it. */
 #ifdef MPI_CACHE
-                                        uint32_t key = _starpu_crc32_be((uintptr_t)data, 0);
+                                        uint32_t key = starpu_crc32_be((uintptr_t)data, 0);
                                         void *already_sent = _starpu_htbl_search_32(sent_data[dest], key);
                                         if (!already_sent) {
                                                 _starpu_htbl_insert_32(&sent_data[dest], key, data);
@@ -286,10 +259,15 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
                 }
 		else if (arg_type==STARPU_VALUE) {
 			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
 		}
 		else if (arg_type==STARPU_CALLBACK) {
 			va_arg(varg_list, void (*)(void *));
 		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG) {
 			va_arg(varg_list, void *);
 		}
@@ -300,13 +278,13 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
 			va_arg(varg_list, int);
 		}
 		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
-			va_arg(varg_list, starpu_data_handle);
+			va_arg(varg_list, starpu_data_handle_t);
 		}
         }
 	va_end(varg_list);
 
 	if (do_execute) {
-                _STARPU_MPI_DEBUG("Execution of the codelet %p\n", codelet);
+                _STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
                 va_start(varg_list, codelet);
                 struct starpu_task *task = starpu_task_create();
                 int ret = _starpu_insert_task_create_and_submit(arg_buffer, codelet, &task, varg_list);
@@ -318,13 +296,13 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
                 va_start(varg_list, codelet);
                 while ((arg_type = va_arg(varg_list, int)) != 0) {
                         if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
-                                starpu_data_handle data = va_arg(varg_list, starpu_data_handle);
+                                starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
                                 if (arg_type & STARPU_W) {
                                         int mpi_rank = starpu_data_get_rank(data);
 					int mpi_tag = starpu_data_get_tag(data);
-					STARPU_ASSERT(mpi_tag >= 0);
+					STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
                                         if (mpi_rank == me) {
-                                                if (execute != -1 && me != execute) {
+                                                if (xrank != -1 && me != xrank) {
                                                         _STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
                                                         starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
                                                 }
@@ -337,10 +315,15 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
                         }
                         else if (arg_type==STARPU_VALUE) {
                                 va_arg(varg_list, void *);
+				va_arg(varg_list, size_t);
                         }
                         else if (arg_type==STARPU_CALLBACK) {
                                 va_arg(varg_list, void (*)(void *));
                         }
+                        else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+                                va_arg(varg_list, void (*)(void *));
+                                va_arg(varg_list, void *);
+                        }
                         else if (arg_type==STARPU_CALLBACK_ARG) {
                                 va_arg(varg_list, void *);
                         }
@@ -351,7 +334,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
                                 va_arg(varg_list, int);
                         }
                         else if (arg_type==STARPU_EXECUTE_ON_DATA) {
-                                va_arg(varg_list, starpu_data_handle);
+                                va_arg(varg_list, starpu_data_handle_t);
                         }
                 }
                 va_end(varg_list);
@@ -360,10 +343,10 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
 	va_start(varg_list, codelet);
 	while ((arg_type = va_arg(varg_list, int)) != 0) {
 		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
-                        starpu_data_handle data = va_arg(varg_list, starpu_data_handle);
+                        starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
 #ifdef MPI_CACHE
                         if (arg_type & STARPU_W) {
-                                uint32_t key = _starpu_crc32_be((uintptr_t)data, 0);
+                                uint32_t key = starpu_crc32_be((uintptr_t)data, 0);
                                 if (do_execute) {
                                         /* Note that all copies I've sent to neighbours are now invalid */
                                         int n, size;
@@ -400,10 +383,15 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
                 }
 		else if (arg_type==STARPU_VALUE) {
 			va_arg(varg_list, void *);
+			va_arg(varg_list, size_t);
 		}
 		else if (arg_type==STARPU_CALLBACK) {
 			va_arg(varg_list, void (*)(void *));
 		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG) {
+			va_arg(varg_list, void (*)(void *));
+			va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG) {
 			va_arg(varg_list, void *);
 		}
@@ -414,7 +402,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
 			va_arg(varg_list, int);
 		}
 		else if (arg_type==STARPU_EXECUTE_ON_DATA) {
-			va_arg(varg_list, starpu_data_handle);
+			va_arg(varg_list, starpu_data_handle_t);
 		}
         }
 	va_end(varg_list);
@@ -422,7 +410,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, starpu_codelet *codelet, ...)
         return 0;
 }
 
-void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle data_handle, int node)
+void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 {
         int me, rank;
 
@@ -441,3 +429,41 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle data_handle,
         }
         starpu_task_wait_for_all();
 }
+
+/* Gather the contributions of every node of comm onto the node that owns
+ * data_handle and combine them with the handle's reduction codelet.
+ *
+ * The owner rank and the MPI tag are read from the handle. The owner posts
+ * one detached receive per other rank, each into a handle obtained from
+ * starpu_data_register_same(), and submits data_handle->redux_cl with
+ * data_handle in RW mode and the received handle in R mode. Every other
+ * node posts one detached send of data_handle to the owner.
+ *
+ * NOTE(review): the handles registered for the receives are not
+ * unregistered in this function -- confirm where they are released. */
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
+{
+	int me, nb_nodes, src;
+	int rank = starpu_data_get_rank(data_handle);
+	int tag = starpu_data_get_tag(data_handle);
+
+	MPI_Comm_rank(comm, &me);
+	MPI_Comm_size(comm, &nb_nodes);
+
+	_STARPU_MPI_DEBUG("Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
+
+	// need to count how many nodes have the data in redux mode
+
+	/* Non-owner nodes only ship their copy to the owner. */
+	if (me != rank) {
+		_STARPU_MPI_DEBUG("Sending redux handle to %d ...\n", rank);
+		starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
+		return;
+	}
+
+	/* Owner: one receive and one reduction task per remote node. */
+	for (src = 0; src < nb_nodes; src++) {
+		starpu_data_handle_t new_handle;
+
+		if (src == rank)
+			continue;
+
+		starpu_data_register_same(&new_handle, data_handle);
+
+		_STARPU_MPI_DEBUG("Receiving redux handle from %d in %p ...\n", src, new_handle);
+
+		starpu_mpi_irecv_detached(new_handle, src, tag, comm, NULL, NULL);
+		starpu_insert_task(data_handle->redux_cl,
+				   STARPU_RW, data_handle,
+				   STARPU_R, new_handle,
+				   0);
+	}
+}

+ 94 - 0
mpi/starpu_mpi_insert_task_cache.c

@@ -0,0 +1,94 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_insert_task_cache.h>
+#include <starpu_hash.h>
+#include <common/htable32.h>
+
+/* Argument handed to _starpu_mpi_clear_cache_callback(): identifies which
+ * cache entry must be cleared once the associated task has completed. */
+typedef struct _starpu_mpi_clear_cache_s {
+        /* Handle whose cache entry is cleared (its address is hashed into the key). */
+        starpu_data_handle_t data;
+        /* Index of the per-rank table (sent_data[] / received_data[]) to update. */
+        int rank;
+        /* _STARPU_MPI_CLEAR_SENT_DATA or _STARPU_MPI_CLEAR_RECEIVED_DATA. */
+        int mode;
+} _starpu_mpi_clear_cache_t;
+
+/* Cache tables, indexed by rank; each entry is a 32-bit-keyed hash table
+ * looked up with the CRC of a data handle's address. They start out NULL and
+ * are not allocated in this file. */
+struct starpu_htbl32_node **sent_data = NULL;
+struct starpu_htbl32_node **received_data = NULL;
+
+/* Task callback: clear the cache entry described by callback_arg, then
+ * release the descriptor.
+ *
+ * callback_arg is a heap-allocated _starpu_mpi_clear_cache_t; it is freed
+ * here. The entry is cleared by storing NULL under the handle's hash key in
+ * the table selected by the descriptor's mode and rank. An unknown mode
+ * clears nothing but the descriptor is still freed. */
+void _starpu_mpi_clear_cache_callback(void *callback_arg)
+{
+	_starpu_mpi_clear_cache_t *req = (_starpu_mpi_clear_cache_t *)callback_arg;
+	uint32_t key = starpu_crc32_be((uintptr_t)req->data, 0);
+
+	switch (req->mode) {
+	case _STARPU_MPI_CLEAR_SENT_DATA:
+		_STARPU_MPI_DEBUG("Clearing sent cache for data %p and rank %d\n", req->data, req->rank);
+		_starpu_htbl_insert_32(&sent_data[req->rank], key, NULL);
+		break;
+	case _STARPU_MPI_CLEAR_RECEIVED_DATA:
+		_STARPU_MPI_DEBUG("Clearing received cache for data %p and rank %d\n", req->data, req->rank);
+		_starpu_htbl_insert_32(&received_data[req->rank], key, NULL);
+		break;
+	default:
+		break;
+	}
+
+	free(req);
+}
+
+/* Cost function of the cache-clearing codelet: always reports a zero cost,
+ * whatever the task or implementation index. */
+double _starpu_mpi_clear_cache_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+
+	return 0.0;
+}
+
+/* Performance model attached to the cache-clearing codelet; its cost
+ * function is _starpu_mpi_clear_cache_cost_function(), which returns 0. */
+static struct starpu_perfmodel _starpu_mpi_clear_cache_model =
+{
+	.cost_function = _starpu_mpi_clear_cache_cost_function,
+	.type = STARPU_COMMON,
+};
+
+/* Intentionally empty kernel: the codelet exists only so that a task can
+ * depend on a data handle; the real work is done in the task callback. */
+static void _starpu_mpi_clear_cache_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
+{
+}
+
+/* Codelet used by _starpu_mpi_clear_cache_request(): it declares the same
+ * empty kernel for CPU, CUDA and OpenCL and takes a single buffer in RW
+ * mode. */
+static struct starpu_codelet _starpu_mpi_clear_cache_codelet =
+{
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
+	.cpu_funcs = {_starpu_mpi_clear_cache_func, NULL},
+	.cuda_funcs = {_starpu_mpi_clear_cache_func, NULL},
+	.opencl_funcs = {_starpu_mpi_clear_cache_func, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &_starpu_mpi_clear_cache_model
+	// The model has a cost function which returns 0 so as to allow the codelet to be scheduled anywhere
+};
+
+/* Schedule the clearing of the cache entry of data_handle for the given rank.
+ *
+ * data_handle: handle whose cache entry must be cleared.
+ * rank:        index of the per-rank cache table to update.
+ * mode:        _STARPU_MPI_CLEAR_SENT_DATA or _STARPU_MPI_CLEAR_RECEIVED_DATA.
+ *
+ * A task using an empty codelet is submitted on data_handle; the entry is
+ * actually cleared by _starpu_mpi_clear_cache_callback(), which also frees
+ * the descriptor allocated here. Allocation or submission failures are
+ * caught by assertions instead of being silently ignored. */
+void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode)
+{
+	int ret;
+	struct starpu_task *task = starpu_task_create();
+
+	// We have a codelet with a empty function just to force the
+	// task being created to have a dependency on data_handle
+	task->cl = &_starpu_mpi_clear_cache_codelet;
+	task->handles[0] = data_handle;
+
+	_starpu_mpi_clear_cache_t *clear_cache = malloc(sizeof(*clear_cache));
+	/* Fail loudly rather than dereference a NULL descriptor. */
+	STARPU_ASSERT(clear_cache != NULL);
+	clear_cache->data = data_handle;
+	clear_cache->rank = rank;
+	clear_cache->mode = mode;
+
+	task->callback_func = _starpu_mpi_clear_cache_callback;
+	task->callback_arg = clear_cache;
+
+	/* If the task is not submitted the callback never runs: the cache
+	 * entry would stay stale and the descriptor would leak. */
+	ret = starpu_task_submit(task);
+	STARPU_ASSERT(ret == 0);
+}
+

+ 26 - 0
mpi/starpu_mpi_insert_task_cache.h

@@ -0,0 +1,26 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define _STARPU_MPI_CLEAR_SENT_DATA     0
+#define _STARPU_MPI_CLEAR_RECEIVED_DATA 1
+
+extern struct starpu_htbl32_node **sent_data;
+extern struct starpu_htbl32_node **received_data;
+
+void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode);

+ 19 - 19
mpi/starpu_mpi_private.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,26 +29,26 @@
 //#define STARPU_MPI_VERBOSE	1
 
 #ifdef STARPU_MPI_VERBOSE
-#  define _STARPU_MPI_DEBUG(fmt, args ...) { if (!getenv("STARPU_SILENT")) { \
-    						int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);                     \
-                                                int yyy; for(yyy=0 ; yyy<=rank ; yyy++) fprintf(stderr, "    ");    \
-                                                fprintf(stderr, "[%d][starpu_mpi][%s] " fmt , rank, __func__ ,##args); \
-                                                fflush(stderr); }}
+/* Print a debug message on stderr unless the STARPU_SILENT environment variable is set. The line is indented by (rank + 1) * 4 spaces, using the rank in MPI_COMM_WORLD, and prefixed with that rank and the calling function's name. The macro expands to a single statement with no trailing semicolon, so it can be used in an unbraced if/else. */
+#  define _STARPU_MPI_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) { \
+    						int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);       \
+                                                int yyy; for(yyy=0 ; yyy<=_debug_rank ; yyy++) fprintf(stderr, "    ");    \
+                                                fprintf(stderr, "[%d][starpu_mpi][%s] " fmt , _debug_rank, __func__ ,##args); \
+                                                fflush(stderr); }} while(0)
 #else
 #  define _STARPU_MPI_DEBUG(fmt, args ...)
 #endif
 
 #ifdef STARPU_MPI_VERBOSE0
-#  define _STARPU_MPI_LOG_IN()             { if (!getenv("STARPU_SILENT")) { \
-                                               int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);                        \
+#  define _STARPU_MPI_LOG_IN()             do { if (!getenv("STARPU_SILENT")) { \
+                                               int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
+                                               int yyy; for(yyy=0 ; yyy<=_debug_rank ; yyy++) fprintf(stderr, "    ");      \
+                                               fprintf(stderr, "[%d][starpu_mpi][%s] -->\n", _debug_rank, __func__ ); \
+                                               fflush(stderr); }} while(0)
+/* Trace the exit of the calling function on stderr unless STARPU_SILENT is set; the line is indented by (rank + 1) * 4 spaces using the rank in MPI_COMM_WORLD. The indentation loop uses the macro's own _debug_rank, so the macro does not depend on a variable named rank at the call site. */
+#  define _STARPU_MPI_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) { \
+                                               int _debug_rank; MPI_Comm_rank(MPI_COMM_WORLD, &_debug_rank);                        \
-                                               int yyy; for(yyy=0 ; yyy<=rank ; yyy++) fprintf(stderr, "    ");      \
+                                               int yyy; for(yyy=0 ; yyy<=_debug_rank ; yyy++) fprintf(stderr, "    ");      \
-                                               fprintf(stderr, "[%d][starpu_mpi][%s] -->\n", rank, __func__ ); \
-                                               fflush(stderr); }}
-#  define _STARPU_MPI_LOG_OUT()            { if (!getenv("STARPU_SILENT")) { \
-                                               int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);                        \
-                                               int yyy; for(yyy=0 ; yyy<=rank ; yyy++) fprintf(stderr, "    ");      \
-                                               fprintf(stderr, "[%d][starpu_mpi][%s] <--\n", rank, __func__ ); \
-                                               fflush(stderr); }}
+                                               fprintf(stderr, "[%d][starpu_mpi][%s] <--\n", _debug_rank, __func__ ); \
+                                               fflush(stderr); }} while(0)
 #else
 #  define _STARPU_MPI_LOG_IN()
 #  define _STARPU_MPI_LOG_OUT()
@@ -60,9 +60,9 @@
 #define TEST_REQ        3
 #define BARRIER_REQ     4
 
-LIST_TYPE(starpu_mpi_req,
+LIST_TYPE(_starpu_mpi_req,
 	/* description of the data at StarPU level */
-	starpu_data_handle data_handle;
+	starpu_data_handle_t data_handle;
 
 	/* description of the data to be sent/received */
 	MPI_Datatype datatype;
@@ -72,7 +72,7 @@ LIST_TYPE(starpu_mpi_req,
 	int mpi_tag;
 	MPI_Comm comm;
 
-	void (*func)(struct starpu_mpi_req_s *);
+	void (*func)(struct _starpu_mpi_req *);
 
 	MPI_Status *status;
 	MPI_Request request;
@@ -89,7 +89,7 @@ LIST_TYPE(starpu_mpi_req,
 
 	/* In the case of a Wait/Test request, we are going to post a request
 	 * to test the completion of another request */
-	struct starpu_mpi_req_s *other_request;
+	struct _starpu_mpi_req *other_request;
 
 	/* in the case of detached requests */
 	unsigned detached;

+ 29 - 0
mpi/starpumpi-1.0.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpumpi
+Description: offers MPI support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
+Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: starpu-1.0
+Requires.private:

+ 29 - 19
mpi/tests/block_interface.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,7 @@
 
 #include <starpu_mpi.h>
 #include <stdlib.h>
+#include "helper.h"
 
 #define NITER	2048
 
@@ -25,31 +26,32 @@
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size < 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need at least processes.\n");
+			FPRINTF(stderr, "We need at least processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
 	/* We only use 2 nodes for that test */
 	if (rank >= 2)
 	{
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
-		
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	 * it as the block data, Node 1 will allocate a block of small size and
@@ -57,7 +59,7 @@ int main(int argc, char **argv)
 	 * their blocks. */
 
 	float *block;
-	starpu_data_handle block_handle;
+	starpu_data_handle_t block_handle;
 
 	if (rank == 0)
 	{
@@ -89,13 +91,17 @@ int main(int argc, char **argv)
 
 	if (rank == 0)
 	{
-		starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
 
 		MPI_Status status;
-		starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
 
 		/* check the content of the block */
-		starpu_data_acquire(block_handle, STARPU_R);
+		ret = starpu_data_acquire(block_handle, STARPU_R);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
 		unsigned i, j, k;
 		for (k = 0; k < SIZE; k++)
 		for (j = 0; j < SIZE; j++)
@@ -104,15 +110,18 @@ int main(int argc, char **argv)
 			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
 		}
 		starpu_data_release(block_handle);
-		
+
 	}
 	else /* rank == 1 */
 	{
 		MPI_Status status;
-		starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
 
 		/* check the content of the block and modify it */
-		starpu_data_acquire(block_handle, STARPU_RW);
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
 		unsigned i, j, k;
 		for (k = 0; k < SIZE; k++)
 		for (j = 0; j < SIZE; j++)
@@ -123,10 +132,11 @@ int main(int argc, char **argv)
 		}
 		starpu_data_release(block_handle);
 
-		starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
 	}
 
-	fprintf(stdout, "Rank %d is done\n", rank);
+	FPRINTF(stdout, "Rank %d is done\n", rank);
 	fflush(stdout);
 
 	starpu_mpi_shutdown();

+ 30 - 19
mpi/tests/block_interface_pinned.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,7 @@
 
 #include <starpu_mpi.h>
 #include <stdlib.h>
+#include "helper.h"
 
 #define NITER	2048
 
@@ -25,31 +26,32 @@
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size < 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need at least processes.\n");
+			FPRINTF(stderr, "We need at least processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
 	/* We only use 2 nodes for that test */
 	if (rank >= 2)
 	{
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
-		
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	/* Node 0 will allocate a big block and only register an inner part of
 	 * it as the block data, Node 1 will allocate a block of small size and
@@ -57,7 +59,7 @@ int main(int argc, char **argv)
 	 * their blocks. */
 
 	float *block;
-	starpu_data_handle block_handle;
+	starpu_data_handle_t block_handle;
 
 	if (rank == 0)
 	{
@@ -91,10 +93,13 @@ int main(int argc, char **argv)
 
 	if (rank == 0)
 	{
-		starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
-
 		MPI_Status status;
-		starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+
+		ret = starpu_mpi_send(block_handle, 1, 0x42, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
+		ret = starpu_mpi_recv(block_handle, 1, 0x1337, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
 
 		/* check the content of the block */
 		starpu_data_acquire(block_handle, STARPU_R);
@@ -106,15 +111,19 @@ int main(int argc, char **argv)
 			assert(block[i + j*BIGSIZE + k*BIGSIZE*BIGSIZE] == 33.0f);
 		}
 		starpu_data_release(block_handle);
-		
+
 	}
 	else /* rank == 1 */
 	{
 		MPI_Status status;
-		starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+
+		ret = starpu_mpi_recv(block_handle, 0, 0x42, MPI_COMM_WORLD, &status);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
 
 		/* check the content of the block and modify it */
-		starpu_data_acquire(block_handle, STARPU_RW);
+		ret = starpu_data_acquire(block_handle, STARPU_RW);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire");
+
 		unsigned i, j, k;
 		for (k = 0; k < SIZE; k++)
 		for (j = 0; j < SIZE; j++)
@@ -125,10 +134,12 @@ int main(int argc, char **argv)
 		}
 		starpu_data_release(block_handle);
 
-		starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		ret = starpu_mpi_send(block_handle, 0, 0x1337, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+
 	}
 
-	fprintf(stdout, "Rank %d is done\n", rank);
+	FPRINTF(stdout, "Rank %d is done\n", rank);
 	fflush(stdout);
 
 	starpu_mpi_shutdown();

+ 22 - 0
mpi/tests/helper.h

@@ -0,0 +1,22 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <errno.h>
+
+#define STARPU_TEST_SKIPPED 77
+
+/* fprintf() wrapper for the MPI tests: prints unless the STARPU_SSILENT
+ * environment variable is set. NOTE(review): the name is STARPU_SSILENT
+ * (double S), not the STARPU_SILENT tested by the _STARPU_MPI_* logging
+ * macros -- confirm this is intended. */
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+

+ 55 - 34
mpi/tests/insert_task.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,27 +16,31 @@
 
 #include <starpu_mpi.h>
 #include <math.h>
+#include "helper.h"
 
 void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
-        fprintf(stdout, "VALUES: %d %d\n", *x, *y);
+        FPRINTF(stdout, "VALUES: %d %d\n", *x, *y);
         *x = (*x + *y) / 2;
 }
 
-starpu_codelet mycodelet = {
+struct starpu_codelet mycodelet =
+{
 	.where = STARPU_CPU,
-	.cpu_func = func_cpu,
-        .nbuffers = 2
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
 };
 
 #define X     4
 #define Y     5
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes) {
+int my_distrib(int x, int y, int nb_nodes)
+{
         return x % nb_nodes;
 }
 
@@ -44,42 +48,51 @@ int my_distrib(int x, int y, int nb_nodes) {
 int main(int argc, char **argv)
 {
         int rank, size, x, y;
-        int value=0;
+        int value=0, ret;
         unsigned matrix[X][Y];
-        starpu_data_handle data_handles[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
 
-	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 
-        for(x = 0; x < X; x++) {
-                for (y = 0; y < Y; y++) {
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
                         matrix[x][y] = (rank+1)*10 + value;
                         value++;
                 }
         }
 #if 0
         for(x = 0; x < X; x++) {
-                fprintf(stdout, "[%d] ", rank);
+                FPRINTF(stdout, "[%d] ", rank);
                 for (y = 0; y < Y; y++) {
-                        fprintf(stdout, "%3d ", matrix[x][y]);
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
                 }
-                fprintf(stdout, "\n");
+                FPRINTF(stdout, "\n");
         }
 #endif
 
-        for(x = 0; x < X; x++) {
-                for (y = 0; y < Y; y++) {
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
                         int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank) {
-                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
                         }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1) {
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
                                 /* I don't own that index, but will need it for my computations */
-                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
                         }
-                        else {
+                        else
+			{
                                 /* I know it's useless to allocate anything for this */
                                 data_handles[x][y] = NULL;
                         }
@@ -91,16 +104,22 @@ int main(int argc, char **argv)
                 }
         }
 
-        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
-        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
-        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
-        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 
-        fprintf(stderr, "Waiting ...\n");
+        FPRINTF(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
 
-        for(x = 0; x < X; x++) {
-                for (y = 0; y < Y; y++) {
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
                         if (data_handles[x][y])
                                 starpu_data_unregister(data_handles[x][y]);
                 }
@@ -109,12 +128,14 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 
 #if 0
-        for(x = 0; x < X; x++) {
-                fprintf(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++) {
-                        fprintf(stdout, "%3d ", matrix[x][y]);
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
                 }
-                fprintf(stdout, "\n");
+                FPRINTF(stdout, "\n");
         }
 #endif
 

+ 61 - 37
mpi/tests/insert_task_block.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,6 +16,7 @@
 
 #include <starpu_mpi.h>
 #include <math.h>
+#include "helper.h"
 
 void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
@@ -27,29 +28,36 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
         int i, j;
         unsigned sum=0;
 
-	for (i = 0; i < nx; i++) {
-		for (j = 0; j < ny; j++) {
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
                         sum += matrix[i+j*ld];
                 }
         }
-	for (i = 0; i < nx; i++) {
-		for (j = 0; j < ny; j++) {
+	for (i = 0; i < nx; i++)
+	{
+		for (j = 0; j < ny; j++)
+		{
                         matrix[i+j*ld] = sum;///(nx*ny);
                 }
         }
 }
 
-starpu_codelet mycodelet = {
+struct starpu_codelet mycodelet =
+{
 	.where = STARPU_CPU,
-	.cpu_func = func_cpu,
-        .nbuffers = 1
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 #define SIZE       6
 #define BLOCKS     3
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes) {
+int my_distrib(int x, int y, int nb_nodes)
+{
         return x % nb_nodes;
 }
 
@@ -57,44 +65,53 @@ int my_distrib(int x, int y, int nb_nodes) {
 int main(int argc, char **argv)
 {
         int rank, size, x, y;
-        int value=0;
+        int ret, value=0;
         unsigned matrix[SIZE*SIZE];
-        starpu_data_handle data_handles[SIZE][SIZE];
+        starpu_data_handle_t data_handles[SIZE][SIZE];
 
-	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 
-        for(x = 0; x < SIZE; x++) {
-                for (y = 0; y < SIZE; y++) {
+        for(x = 0; x < SIZE; x++)
+	{
+                for (y = 0; y < SIZE; y++)
+		{
                         matrix[x+y*SIZE] = rank*100 + value;
                         value++;
                 }
         }
 #if 1
         for(x = 0; x < SIZE; x++) {
-                fprintf(stdout, "[%d] ", rank);
+                FPRINTF(stdout, "[%d] ", rank);
                 for (y = 0; y < SIZE; y++) {
-                        fprintf(stdout, "%3d ", matrix[x+y*SIZE]);
+                        FPRINTF(stdout, "%3d ", matrix[x+y*SIZE]);
                 }
-                fprintf(stdout, "\n");
+                FPRINTF(stdout, "\n");
         }
 #endif
 
-        for(x = 0; x < BLOCKS ;  x++) {
-                for (y = 0; y < BLOCKS; y++) {
+        for(x = 0; x < BLOCKS ;  x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
                         int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank) {
-                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
                                 starpu_matrix_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
                                                             SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
                         }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1) {
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
                                 /* I don't own that index, but will need it for my computations */
-                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)&(matrix[((SIZE/BLOCKS)*x) + ((SIZE/BLOCKS)*y) * SIZE]),
                                                             SIZE, SIZE/BLOCKS, SIZE/BLOCKS, sizeof(unsigned));
                         }
-                        else {
+                        else
+			{
                                 /* I know it's useless to allocate anything for this */
                                 data_handles[x][y] = NULL;
                         }
@@ -106,19 +123,25 @@ int main(int argc, char **argv)
                 }
         }
 
-        for(x = 0; x < BLOCKS; x++) {
-                for (y = 0; y < BLOCKS; y++) {
-                        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
-                                               STARPU_RW, data_handles[x][y],
-                                               0);
+        for(x = 0; x < BLOCKS; x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
+                        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
+						     STARPU_RW, data_handles[x][y],
+						     0);
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+
                 }
         }
 
-        fprintf(stderr, "Waiting ...\n");
+        FPRINTF(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
 
-        for(x = 0; x < BLOCKS; x++) {
-                for (y = 0; y < BLOCKS; y++) {
+        for(x = 0; x < BLOCKS; x++)
+	{
+                for (y = 0; y < BLOCKS; y++)
+		{
                         if (data_handles[x][y])
                                 starpu_data_unregister(data_handles[x][y]);
                 }
@@ -128,12 +151,13 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 
 #if 1
-        for(x = 0; x < SIZE; x++) {
-                fprintf(stdout, "[%d] ", rank);
+        for(x = 0; x < SIZE; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
                 for (y = 0; y < SIZE; y++) {
-                        fprintf(stdout, "%3d ", matrix[x+y*SIZE]);
+                        FPRINTF(stdout, "%3d ", matrix[x+y*SIZE]);
                 }
-                fprintf(stdout, "\n");
+                FPRINTF(stdout, "\n");
         }
 #endif
 

+ 59 - 36
mpi/tests/insert_task_cache.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,27 +16,31 @@
 
 #include <starpu_mpi.h>
 #include <math.h>
+#include "helper.h"
 
 void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
 	unsigned *x = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned *y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
-        fprintf(stdout, "VALUES: %d %d\n", *x, *y);
+        FPRINTF(stdout, "VALUES: %d %d\n", *x, *y);
         *x = (*x + *y) / 2;
 }
 
-starpu_codelet mycodelet = {
+struct starpu_codelet mycodelet =
+{
 	.where = STARPU_CPU,
-	.cpu_func = func_cpu,
-        .nbuffers = 2
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
 };
 
 #define X     4
 #define Y     5
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes) {
+int my_distrib(int x, int y, int nb_nodes)
+{
         return x % nb_nodes;
 }
 
@@ -44,42 +48,53 @@ int my_distrib(int x, int y, int nb_nodes) {
 int main(int argc, char **argv)
 {
         int rank, size, x, y;
-        int value=0;
+        int ret,value=0;
         unsigned matrix[X][Y];
-        starpu_data_handle data_handles[X][Y];
+        starpu_data_handle_t data_handles[X][Y];
 
-	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 
-        for(x = 0; x < X; x++) {
-                for (y = 0; y < Y; y++) {
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
                         matrix[x][y] = (rank+1)*10 + value;
                         value++;
                 }
         }
 #if 0
-        for(x = 0; x < X; x++) {
-                fprintf(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++) {
-                        fprintf(stdout, "%3d ", matrix[x][y]);
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
                 }
-                fprintf(stdout, "\n");
+                FPRINTF(stdout, "\n");
         }
 #endif
 
-        for(x = 0; x < X; x++) {
-                for (y = 0; y < Y; y++) {
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
                         int mpi_rank = my_distrib(x, y, size);
-                        if (mpi_rank == rank) {
-                                //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
+                        if (mpi_rank == rank)
+			{
+                                //FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
                         }
-                        else if (rank == mpi_rank+1 || rank == mpi_rank-1) {
+                        else if (rank == mpi_rank+1 || rank == mpi_rank-1)
+			{
                                 /* I don't own that index, but will need it for my computations */
-                                //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
+                                //FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
                                 starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
                         }
-                        else {
+                        else
+			{
                                 /* I know it's useless to allocate anything for this */
                                 data_handles[x][y] = NULL;
                         }
@@ -91,16 +106,22 @@ int main(int argc, char **argv)
                 }
         }
 
-        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
-        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
-        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
-        starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
+        ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
 
-        fprintf(stderr, "Waiting ...\n");
+        FPRINTF(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
 
-        for(x = 0; x < X; x++) {
-                for (y = 0; y < Y; y++) {
+        for(x = 0; x < X; x++)
+	{
+                for (y = 0; y < Y; y++)
+		{
                         if (data_handles[x][y])
                                 starpu_data_unregister(data_handles[x][y]);
                 }
@@ -109,12 +130,14 @@ int main(int argc, char **argv)
 	starpu_shutdown();
 
 #if 0
-        for(x = 0; x < X; x++) {
-                fprintf(stdout, "[%d] ", rank);
-                for (y = 0; y < Y; y++) {
-                        fprintf(stdout, "%3d ", matrix[x][y]);
+        for(x = 0; x < X; x++)
+	{
+                FPRINTF(stdout, "[%d] ", rank);
+                for (y = 0; y < Y; y++)
+		{
+                        FPRINTF(stdout, "%3d ", matrix[x][y]);
                 }
-                fprintf(stdout, "\n");
+                FPRINTF(stdout, "\n");
         }
 #endif
 

+ 107 - 69
mpi/tests/insert_task_owner.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,53 +16,82 @@
 
 #include <starpu_mpi.h>
 #include <math.h>
+#include "helper.h"
 
 void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
-	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
+	int node;
+	int rank;
 
-        *x = *x + 1;
-        *y = *y + 1;
+        starpu_codelet_unpack_args(_args, &node);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	FPRINTF(stderr, "Expected node: %d - Actual node: %d\n", node, rank);
+
+	assert(node == rank);
 }
 
-starpu_codelet mycodelet = {
+struct starpu_codelet mycodelet_r_w =
+{
 	.where = STARPU_CPU,
-	.cpu_func = func_cpu,
-        .nbuffers = 2
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W}
 };
 
-#define ACQUIRE_DATA \
-        if (rank == 0) starpu_data_acquire(data_handlesx0, STARPU_R);    \
-        if (rank == 1) starpu_data_acquire(data_handlesx1, STARPU_R);    \
-        fprintf(stderr, "[%d] Values: %d %d\n", rank, x0, x1);
-
-#define RELEASE_DATA \
-        if (rank == 0) starpu_data_release(data_handlesx0); \
-        if (rank == 1) starpu_data_release(data_handlesx1); \
+struct starpu_codelet mycodelet_rw_r =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_R}
+};
 
-#define CHECK_RESULT \
-        if (rank == 0) assert(x0 == vx0[0] && x1 == vx1[0]); \
-        if (rank == 1) assert(x0 == vx0[1] && x1 == vx1[1]);
+struct starpu_codelet mycodelet_rw_rw =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
+};
 
-int main(int argc, char **argv)
+struct starpu_codelet mycodelet_w_r =
 {
-        int rank, size, err;
-        int x0=0, x1=0, vx0[2] = {x0, x0}, vx1[2]={x1,x1};
-        starpu_data_handle data_handlesx0;
-        starpu_data_handle data_handlesx1;
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R}
+};
 
-	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+struct starpu_codelet mycodelet_r_r =
+{
+	.where = STARPU_CPU,
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_R, STARPU_R}
+};
 
-        if (size != 2) {
-		if (rank == 0) fprintf(stderr, "We need exactly 2 processes.\n");
+int main(int argc, char **argv)
+{
+        int ret, rank, size, err, node;
+        int x0=32, x1=23;
+        starpu_data_handle_t data_handlesx0;
+        starpu_data_handle_t data_handlesx1;
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
+
+        if (size != 2)
+	{
+		if (rank == 0) FPRINTF(stderr, "We need exactly 2 processes.\n");
                 starpu_mpi_shutdown();
                 starpu_shutdown();
-                return 0;
+                return STARPU_TEST_SKIPPED;
         }
 
-        if (rank == 0) {
+        if (rank == 0)
+	{
                 starpu_variable_data_register(&data_handlesx0, 0, (uintptr_t)&x0, sizeof(x0));
                 starpu_data_set_rank(data_handlesx0, rank);
 		starpu_data_set_tag(data_handlesx0, 0);
@@ -70,7 +99,8 @@ int main(int argc, char **argv)
                 starpu_data_set_rank(data_handlesx1, 1);
 		starpu_data_set_tag(data_handlesx1, 1);
         }
-        else if (rank == 1) {
+        else if (rank == 1)
+	{
                 starpu_variable_data_register(&data_handlesx1, 0, (uintptr_t)&x1, sizeof(x1));
                 starpu_data_set_rank(data_handlesx1, rank);
 		starpu_data_set_tag(data_handlesx1, 1);
@@ -79,60 +109,68 @@ int main(int argc, char **argv)
 		starpu_data_set_tag(data_handlesx0, 0);
         }
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, 0);
+	node = starpu_data_get_rank(data_handlesx1);
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1,
+				     0);
         assert(err == 0);
-        ACQUIRE_DATA;
-        vx1[1]++;
-        CHECK_RESULT;
-        RELEASE_DATA;
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1, 0);
+	node = starpu_data_get_rank(data_handlesx0);
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_R, data_handlesx1,
+				     0);
         assert(err == 0);
-        ACQUIRE_DATA;
-        vx0[0] ++;
-        CHECK_RESULT;
-        RELEASE_DATA;
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, 0);
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1,
+				     0);
         assert(err == -EINVAL);
-        ACQUIRE_DATA;
-        CHECK_RESULT;
-        RELEASE_DATA;
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, 1, 0);
+	node = 1;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
+        assert(err == 0);
+
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_rw_rw,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
         assert(err == 0);
-        ACQUIRE_DATA;
-        vx0[0] ++ ; vx1[1] ++;
-        CHECK_RESULT;
-        RELEASE_DATA;
 
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handlesx0, STARPU_RW, data_handlesx1, STARPU_EXECUTE_ON_NODE, 0, 0);
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
         assert(err == 0);
-        ACQUIRE_DATA;
-        vx0[0] ++ ; vx1[1] ++;
-        CHECK_RESULT;
-        RELEASE_DATA;
 
         /* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
-           going to be ignored as the data model clearly specifies
-           which task is going to execute the codelet */
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, 12, 0);
+           going to override the node even though the data model clearly specifies
+           which node is going to execute the codelet */
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_r_w,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_R, data_handlesx0, STARPU_W, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
         assert(err == 0);
-        ACQUIRE_DATA;
-        vx1[1] ++;
-        CHECK_RESULT;
-        RELEASE_DATA;
 
         /* Here the value specified by the property STARPU_EXECUTE_ON_NODE is
-           going to be ignored as the data model clearly specifies
-           which task is going to execute the codelet */
-        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, 11, 0);
+           going to override the node even though the data model clearly specifies
+           which node is going to execute the codelet */
+	node = 0;
+        err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet_w_r,
+				     STARPU_VALUE, &node, sizeof(node),
+				     STARPU_W, data_handlesx0, STARPU_R, data_handlesx1, STARPU_EXECUTE_ON_NODE, node,
+				     0);
         assert(err == 0);
-        ACQUIRE_DATA;
-        vx0[0] ++;
-        CHECK_RESULT;
-        RELEASE_DATA;
 
+	fprintf(stderr, "Waiting ...\n");
         starpu_task_wait_for_all();
 	starpu_mpi_shutdown();
 	starpu_shutdown();

+ 33 - 22
mpi/tests/insert_task_owner2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,7 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi_datatype.h>
 #include <math.h>
+#include "helper.h"
 
 void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
@@ -25,43 +26,50 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	int *x2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
 	int *y = (int *)STARPU_VARIABLE_GET_PTR(descr[3]);
 
-//        fprintf(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+//        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
 //
 //        *x2 = 45;
 //        *y = 144;
 //
-        fprintf(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
         *y = (*x0 + *x1) * 100;
         *x1 = 12;
         *x2 = 24;
         *x0 = 36;
-        fprintf(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
+        FPRINTF(stderr, "-------> CODELET VALUES: %d %d %d %d\n", *x0, *x1, *x2, *y);
 }
 
-starpu_codelet mycodelet = {
+struct starpu_codelet mycodelet =
+{
 	.where = STARPU_CPU,
-	.cpu_func = func_cpu,
-        .nbuffers = 4
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 4,
+	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
 };
 
 int main(int argc, char **argv)
 {
         int rank, size, err;
         int x[3], y=0;
-        int i;
-        starpu_data_handle data_handles[4];
+        int i, ret;
+        starpu_data_handle_t data_handles[4];
 
-	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 
-        if (rank > 1) {
+        if (rank > 1)
+	{
                 starpu_mpi_shutdown();
                 starpu_shutdown();
-                return 0;
+                return STARPU_TEST_SKIPPED;
         }
 
-        if (rank == 0) {
-                for(i=0 ; i<3 ; i++) {
+        if (rank == 0)
+	{
+                for(i=0 ; i<3 ; i++)
+		{
                         x[i] = 10*(i+1);
                         starpu_variable_data_register(&data_handles[i], 0, (uintptr_t)&x[i], sizeof(x[i]));
                         starpu_data_set_rank(data_handles[i], rank);
@@ -72,8 +80,10 @@ int main(int argc, char **argv)
                 starpu_data_set_rank(data_handles[3], 1);
 		starpu_data_set_tag(data_handles[3], 3);
         }
-        else if (rank == 1) {
-                for(i=0 ; i<3 ; i++) {
+        else if (rank == 1)
+	{
+                for(i=0 ; i<3 ; i++)
+		{
                         x[i] = -1;
                         starpu_variable_data_register(&data_handles[i], -1, (uintptr_t)NULL, sizeof(int));
                         starpu_data_set_rank(data_handles[i], 0);
@@ -84,24 +94,25 @@ int main(int argc, char **argv)
                 starpu_data_set_rank(data_handles[3], rank);
 		starpu_data_set_tag(data_handles[3], 3);
         }
-        fprintf(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+        FPRINTF(stderr, "[%d][init] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
 
         err = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet,
                                      STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
                                      STARPU_W, data_handles[2],
                                      STARPU_W, data_handles[3],
                                      STARPU_EXECUTE_ON_NODE, 1, 0);
-        assert(err == 0);
+	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_insert_task");
         starpu_task_wait_for_all();
 
         int *values = malloc(4 * sizeof(int *));
-        for(i=0 ; i<4 ; i++) {
+        for(i=0 ; i<4 ; i++)
+	{
                 starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[i], 0);
                 starpu_data_acquire(data_handles[i], STARPU_R);
                 values[i] = *((int *)starpu_mpi_handle_to_ptr(data_handles[i]));
         }
-        fprintf(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
-        fprintf(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
+        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d %d %d\n", rank, values[0], values[1], values[2], values[3]);
+        FPRINTF(stderr, "[%d][end] VALUES: %d %d %d %d\n", rank, x[0], x[1], x[2], y);
 
 	starpu_mpi_shutdown();
 	starpu_shutdown();

+ 23 - 14
mpi/tests/insert_task_owner_data.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,7 @@
 #include <starpu_mpi.h>
 #include <starpu_mpi_datatype.h>
 #include <math.h>
+#include "helper.h"
 
 void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 {
@@ -27,30 +28,36 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	*x1 *= *x1;
 }
 
-starpu_codelet mycodelet = {
+struct starpu_codelet mycodelet =
+{
 	.where = STARPU_CPU,
-	.cpu_func = func_cpu,
-        .nbuffers = 2
+	.cpu_funcs = {func_cpu, NULL},
+        .nbuffers = 2,
+	.modes = {STARPU_RW, STARPU_RW}
 };
 
 int main(int argc, char **argv)
 {
         int rank, size, err;
         int x[2];
-        int i;
-        starpu_data_handle data_handles[2];
+        int ret, i;
+        starpu_data_handle_t data_handles[2];
 	int values[2];
 
-	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 
-        if (rank > 1) {
+        if (rank > 1)
+	{
                 starpu_mpi_shutdown();
                 starpu_shutdown();
-                return 0;
+                return STARPU_TEST_SKIPPED;
         }
 
-        if (rank == 0) {
+        if (rank == 0)
+	{
 		x[0] = 11;
 		starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x[0], sizeof(x[0]));
 		starpu_data_set_rank(data_handles[0], 0);
@@ -59,7 +66,8 @@ int main(int argc, char **argv)
 		starpu_data_set_rank(data_handles[1], 1);
 		starpu_data_set_tag(data_handles[1],10);
         }
-        else if (rank == 1) {
+        else if (rank == 1)
+	{
 		x[1] = 12;
 		starpu_variable_data_register(&data_handles[0], -1, (uintptr_t)NULL, sizeof(x[0]));
 		starpu_data_set_rank(data_handles[0], 0);
@@ -76,13 +84,14 @@ int main(int argc, char **argv)
         assert(err == 0);
         starpu_task_wait_for_all();
 
-        for(i=0 ; i<2 ; i++) {
+        for(i=0 ; i<2 ; i++)
+	{
                 starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[i], 0);
                 starpu_data_acquire(data_handles[i], STARPU_R);
                 values[i] = *((int *)starpu_mpi_handle_to_ptr(data_handles[i]));
         }
 	assert(values[0] == 12 && values[1] == 144);
-        fprintf(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
+        FPRINTF(stderr, "[%d][local ptr] VALUES: %d %d\n", rank, values[0], values[1]);
 
 	starpu_mpi_shutdown();
 	starpu_shutdown();

+ 14 - 11
mpi/tests/mpi_detached_tag.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,33 +16,35 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 #define SIZE	16
 
 float *tab;
-starpu_data_handle tab_handle;
+starpu_data_handle_t tab_handle;
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size != 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -61,13 +63,14 @@ int main(int argc, char **argv)
 		{
 			starpu_mpi_isend_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
 		}
-		else {
+		else
+		{
 			starpu_mpi_irecv_detached_unlock_tag(tab_handle, other_rank, loop, MPI_COMM_WORLD, tag);
 		}
 
 		starpu_tag_wait(tag);
 	}
-	
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 14 - 11
mpi/tests/mpi_irecv.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,33 +16,35 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 #define SIZE	16
 
 float *tab;
-starpu_data_handle tab_handle;
+starpu_data_handle_t tab_handle;
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size != 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -59,14 +61,15 @@ int main(int argc, char **argv)
 		{
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}
-		else {
+		else
+		{
 			MPI_Status status;
 			starpu_mpi_req req;
 			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
 			starpu_mpi_wait(&req, &status);
 		}
 	}
-	
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 20 - 17
mpi/tests/mpi_irecv_detached.c

@@ -17,12 +17,13 @@
 
 #include <starpu_mpi.h>
 #include <common/utils.h>
+#include "helper.h"
 
 #define NITER	2048
 #define SIZE	16
 
 float *tab;
-starpu_data_handle tab_handle;
+starpu_data_handle_t tab_handle;
 
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
@@ -30,34 +31,35 @@ static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
 void callback(void *arg __attribute__((unused)))
 {
 	unsigned *received = arg;
-	
-	PTHREAD_MUTEX_LOCK(&mutex);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	*received = 1;
-	PTHREAD_COND_SIGNAL(&cond);
-	PTHREAD_MUTEX_UNLOCK(&mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 }
 
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size != 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -74,17 +76,18 @@ int main(int argc, char **argv)
 		{
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}
-		else {
+		else
+		{
 			int received = 0;
 			starpu_mpi_irecv_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &received);
 
-			PTHREAD_MUTEX_LOCK(&mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 			while (!received)
-				PTHREAD_COND_WAIT(&cond, &mutex);
-			PTHREAD_MUTEX_UNLOCK(&mutex);
+				_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		}
 	}
-	
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 14 - 11
mpi/tests/mpi_isend.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,33 +16,35 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 #define SIZE	16
 
 float *tab;
-starpu_data_handle tab_handle;
+starpu_data_handle_t tab_handle;
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size != 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -62,12 +64,13 @@ int main(int argc, char **argv)
 			starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
 			starpu_mpi_wait(&req, &status);
 		}
-		else {
+		else
+		{
 			MPI_Status status;
 			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
 		}
 	}
-	
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 20 - 17
mpi/tests/mpi_isend_detached.c

@@ -18,12 +18,13 @@
 #include <starpu_mpi.h>
 #include <common/utils.h>
 #include <pthread.h>
+#include "helper.h"
 
 #define NITER	2048
 #define SIZE	16
 
 static float *tab;
-static starpu_data_handle tab_handle;
+static starpu_data_handle_t tab_handle;
 
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
@@ -31,33 +32,34 @@ static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
 void callback(void *arg __attribute__((unused)))
 {
 	unsigned *sent = arg;
-	
-	PTHREAD_MUTEX_LOCK(&mutex);
+
+	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 	*sent = 1;
-	PTHREAD_COND_SIGNAL(&cond);
-	PTHREAD_MUTEX_UNLOCK(&mutex);
+	_STARPU_PTHREAD_COND_SIGNAL(&cond);
+	_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 }
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size != 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -75,17 +77,18 @@ int main(int argc, char **argv)
 			int sent = 0;
 			starpu_mpi_isend_detached(tab_handle, other_rank, loop, MPI_COMM_WORLD, callback, &sent);
 
-			PTHREAD_MUTEX_LOCK(&mutex);
+			_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
 			while (!sent)
-				PTHREAD_COND_WAIT(&cond, &mutex);
-			PTHREAD_MUTEX_UNLOCK(&mutex);
+				_STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
+			_STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 		}
-		else {
+		else
+		{
 			MPI_Status status;
 			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
 		}
 	}
-	
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 18 - 13
mpi/tests/mpi_test.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,33 +16,35 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 #define SIZE	16
 
 float *tab;
-starpu_data_handle tab_handle;
+starpu_data_handle_t tab_handle;
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size != 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -61,17 +63,20 @@ int main(int argc, char **argv)
 		{
                         starpu_mpi_isend(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
 		}
-		else {
+		else
+		{
 			starpu_mpi_irecv(tab_handle, &req, other_rank, loop, MPI_COMM_WORLD);
 		}
 
 		int finished = 0;
-		do {
+		do
+		{
 			MPI_Status status;
 			starpu_mpi_test(&req, &finished, &status);
-		} while (!finished);
+		}
+		while (!finished);
 	}
-	
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 26 - 16
mpi/tests/multiple_send.c

@@ -15,29 +15,32 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 
 int main(int argc, char **argv)
 {
-	int rank, size;
+	int ret, rank, size;
         unsigned send[2] = {42, 11};
         unsigned recv[2] = {33, 33};
         starpu_mpi_req req[2];
-        starpu_data_handle send_handle[2];
-        starpu_data_handle recv_handle[2];
+        starpu_data_handle_t send_handle[2];
+        starpu_data_handle_t recv_handle[2];
 
-	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 
 	if (size < 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need at least 2 processes.\n");
+			FPRINTF(stderr, "We need at least 2 processes.\n");
 
                 starpu_mpi_shutdown();
                 starpu_shutdown();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
 	starpu_variable_data_register(&send_handle[0], 0, (uintptr_t)&send[0], sizeof(unsigned));
@@ -45,27 +48,34 @@ int main(int argc, char **argv)
 	starpu_variable_data_register(&recv_handle[0], 0, (uintptr_t)&recv[0], sizeof(unsigned));
 	starpu_variable_data_register(&recv_handle[1], 0, (uintptr_t)&recv[1], sizeof(unsigned));
 
-        if (rank == 0) {
+        if (rank == 0)
+	{
                 starpu_mpi_isend(send_handle[0], &(req[0]), 1, 12, MPI_COMM_WORLD);
                 starpu_mpi_isend(send_handle[1], &(req[1]), 1, 13, MPI_COMM_WORLD);
         }
-        else if (rank == 1) {
+        else if (rank == 1)
+	{
                 starpu_mpi_irecv(recv_handle[0], &(req[0]), 0, 12, MPI_COMM_WORLD);
                 starpu_mpi_irecv(recv_handle[1], &(req[1]), 0, 13, MPI_COMM_WORLD);
         }
 
-        if (rank == 0 || rank == 1) {
+        if (rank == 0 || rank == 1)
+	{
                 int nb_req=2;
-                while (nb_req) {
+                while (nb_req)
+		{
                         int r=0;
-                        for(r=0 ; r<2 ; r++) {
-                                if (req[r]) {
+                        for(r=0 ; r<2 ; r++)
+			{
+                                if (req[r])
+				{
                                         int finished = 0;
                                         MPI_Status status;
                                         starpu_mpi_test(&req[r], &finished, &status);
                                         STARPU_ASSERT(finished != -1);
-                                        if(finished) {
-                                                fprintf(stderr, "[%d] Request %d finished\n", rank, r);
+                                        if (finished)
+					{
+                                                FPRINTF(stderr, "[%d] Request %d finished\n", rank, r);
                                                 req[r] = NULL;
                                                 nb_req--;
                                         }
@@ -73,7 +83,7 @@ int main(int argc, char **argv)
                         }
                 }
         }
-        fprintf(stderr, "[%d] All requests finished\n", rank);
+        FPRINTF(stderr, "[%d] All requests finished\n", rank);
 
 	starpu_mpi_shutdown();
 	starpu_shutdown();

+ 14 - 12
mpi/tests/pingpong.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,33 +16,35 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 #define SIZE	16
 
 float *tab;
-starpu_data_handle tab_handle;
+starpu_data_handle_t tab_handle;
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size != 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need exactly 2 processes.\n");
+			FPRINTF(stderr, "We need exactly 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	tab = malloc(SIZE*sizeof(float));
 
@@ -50,7 +52,6 @@ int main(int argc, char **argv)
 
 	unsigned nloops = NITER;
 	unsigned loop;
-
 	int other_rank = (rank + 1)%2;
 
 	for (loop = 0; loop < nloops; loop++)
@@ -59,12 +60,13 @@ int main(int argc, char **argv)
 		{
 			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
 		}
-		else {
+		else
+		{
 			MPI_Status status;
 			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
 		}
 	}
-	
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 25 - 22
mpi/tests/ring.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,11 +16,12 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 
 unsigned token = 42;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
 #ifdef STARPU_USE_CUDA
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
@@ -32,13 +33,15 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	(*tokenptr)++;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
-	.cpu_func = increment_cpu,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 void increment_token(void)
@@ -46,10 +49,7 @@ void increment_token(void)
 	struct starpu_task *task = starpu_task_create();
 
 	task->cl = &increment_cl;
-	
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
-
+	task->handles[0] = token_handle;
 	task->synchronous = 1;
 
 	starpu_task_submit(task);
@@ -57,24 +57,25 @@ void increment_token(void)
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size < 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need at least 2 processes.\n");
+			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 
@@ -91,22 +92,24 @@ int main(int argc, char **argv)
 		if (loop == 0 && rank == 0)
 		{
 			token = 0;
-			fprintf(stdout, "Start with token value %d\n", token);
+			FPRINTF(stdout, "Start with token value %d\n", token);
 		}
-		else {
+		else
+		{
 			MPI_Status status;
 			starpu_mpi_recv(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, &status);
 		}
 
 		increment_token();
-		
+
 		if (loop == last_loop && rank == last_rank)
 		{
 			starpu_data_acquire(token_handle, STARPU_R);
-			fprintf(stdout, "Finished : token value %d\n", token);
+			FPRINTF(stdout, "Finished : token value %d\n", token);
 			starpu_data_release(token_handle);
 		}
-		else {
+		else
+		{
 			starpu_mpi_send(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD);
 		}
 	}

+ 22 - 20
mpi/tests/ring_async.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,11 +16,12 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 
 unsigned token = 42;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
 #ifdef STARPU_USE_CUDA
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
@@ -32,13 +33,15 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	(*tokenptr)++;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
-	.cpu_func = increment_cpu,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 void increment_token(void)
@@ -46,10 +49,7 @@ void increment_token(void)
 	struct starpu_task *task = starpu_task_create();
 
 	task->cl = &increment_cl;
-	
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
-
+	task->handles[0] = token_handle;
 	task->synchronous = 1;
 
 	starpu_task_submit(task);
@@ -57,24 +57,25 @@ void increment_token(void)
 
 int main(int argc, char **argv)
 {
-	MPI_Init(NULL, NULL);
-
-	int rank, size;
+	int ret, rank, size;
 
+	MPI_Init(NULL, NULL);
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
 	if (size < 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need at least 2 processes.\n");
+			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
-	starpu_init(NULL);
-	starpu_mpi_initialize();
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize();
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize");
 
 	starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(unsigned));
 
@@ -91,9 +92,10 @@ int main(int argc, char **argv)
 		if (loop == 0 && rank == 0)
 		{
 			token = 0;
-			fprintf(stdout, "Start with token value %d\n", token);
+			FPRINTF(stdout, "Start with token value %d\n", token);
 		}
-		else {
+		else
+		{
 			MPI_Status status;
 			starpu_mpi_req req;
 			starpu_mpi_irecv(token_handle, &req, (rank+size-1)%size, tag, MPI_COMM_WORLD);
@@ -105,7 +107,7 @@ int main(int argc, char **argv)
 		if (loop == last_loop && rank == last_rank)
 		{
 			starpu_data_acquire(token_handle, STARPU_R);
-			fprintf(stdout, "Finished : token value %d\n", token);
+			FPRINTF(stdout, "Finished : token value %d\n", token);
 			starpu_data_release(token_handle);
 		}
 		else {

+ 22 - 17
mpi/tests/ring_async_implicit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -16,11 +16,12 @@
  */
 
 #include <starpu_mpi.h>
+#include "helper.h"
 
 #define NITER	2048
 
 unsigned token = 42;
-starpu_data_handle token_handle;
+starpu_data_handle_t token_handle;
 
 #ifdef STARPU_USE_CUDA
 extern void increment_cuda(void *descr[], __attribute__ ((unused)) void *_args);
@@ -32,13 +33,15 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 	(*tokenptr)++;
 }
 
-static starpu_codelet increment_cl = {
+static struct starpu_codelet increment_cl =
+{
 	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
-	.cuda_func = increment_cuda,
+	.cuda_funcs = {increment_cuda, NULL},
 #endif
-	.cpu_func = increment_cpu,
-	.nbuffers = 1
+	.cpu_funcs = {increment_cpu, NULL},
+	.nbuffers = 1,
+	.modes = {STARPU_RW}
 };
 
 void increment_token(void)
@@ -46,15 +49,14 @@ void increment_token(void)
 	struct starpu_task *task = starpu_task_create();
 
 	task->cl = &increment_cl;
-	task->buffers[0].handle = token_handle;
-	task->buffers[0].mode = STARPU_RW;
+	task->handles[0] = token_handle;
 
 	starpu_task_submit(task);
 }
 
 int main(int argc, char **argv)
 {
-	int rank, size;
+	int ret, rank, size;
 
 #if 0
 	MPI_Init(NULL, NULL);
@@ -62,16 +64,18 @@ int main(int argc, char **argv)
 	MPI_Comm_size(MPI_COMM_WORLD, &size);
 #endif
 
-	starpu_init(NULL);
-	starpu_mpi_initialize_extended(&rank, &size);
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_initialize_extended(&rank, &size);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_initialize_extended");
 
 	if (size < 2)
 	{
 		if (rank == 0)
-			fprintf(stderr, "We need at least 2 processes.\n");
+			FPRINTF(stderr, "We need at least 2 processes.\n");
 
 		MPI_Finalize();
-		return 0;
+		return STARPU_TEST_SKIPPED;
 	}
 
 
@@ -90,7 +94,7 @@ int main(int argc, char **argv)
 		if (loop == 0 && rank == 0)
 		{
 			token = 0;
-			fprintf(stdout, "Start with token value %d\n", token);
+			FPRINTF(stdout, "Start with token value %d\n", token);
 		}
 		else
 		{
@@ -102,10 +106,11 @@ int main(int argc, char **argv)
 		if (loop == last_loop && rank == last_rank)
 		{
 			starpu_data_acquire(token_handle, STARPU_R);
-			fprintf(stdout, "Finished : token value %d\n", token);
+			FPRINTF(stdout, "Finished : token value %d\n", token);
 			starpu_data_release(token_handle);
 		}
-		else {
+		else
+		{
 			starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
 		}
 	}
@@ -119,7 +124,7 @@ int main(int argc, char **argv)
 
 	if (rank == last_rank)
 	{
-                fprintf(stderr, "[%d] token = %d == %d * %d ?\n", rank, token, nloops, size);
+                FPRINTF(stderr, "[%d] token = %d == %d * %d ?\n", rank, token, nloops, size);
                 STARPU_ASSERT(token == nloops*size);
 	}
 

+ 10 - 2
socl/Makefile.am

@@ -13,11 +13,11 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-SUBDIRS = src
+SUBDIRS = src examples
 
 EXTRA_DIST = README
 
-libsocl_la_includedir=$(includedir)/starpu/CL
+libsocl_la_includedir=$(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)/socl/CL
 
 libsocl_la_include_HEADERS = \
   include/CL/cl.h \
@@ -28,3 +28,11 @@ libsocl_la_include_HEADERS = \
   include/CL/cl_starpu.h \
   include/CL/opencl.h \
   include/CL/cl.hpp
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = socl-1.0.pc
+
+# Run the showcheck target of every sub-directory. $(MAKE) is used instead of
+# a literal "make" so that the same make program, its flags and its jobserver
+# are passed down to the recursive invocations.
+showcheck:
+	for i in $(SUBDIRS) ; do \
+		$(MAKE) -C $$i showcheck ; \
+	done

+ 67 - 0
socl/examples/Makefile.am

@@ -0,0 +1,67 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+LIBS = $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la
+AM_CPPFLAGS = -I$(top_srcdir)/socl/include/ 
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+
+
+SOCL_EXAMPLES	=
+TESTS		=	$(SOCL_EXAMPLES)
+
+if STARPU_HAVE_WINDOWS
+check_PROGRAMS	=	$(SOCL_EXAMPLES)
+else
+check_PROGRAMS	=	$(LOADER) $(SOCL_EXAMPLES)
+endif
+
+if !STARPU_HAVE_WINDOWS
+## test loader program
+LOADER			=	loader
+loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+LOADER_BIN		=	$(abs_top_builddir)/socl/examples/$(LOADER)
+loader_SOURCES		=	../../tests/loader.c
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" $(LOADER_BIN)
+endif
+
+examplebindir = $(libdir)/starpu/examples/socl/
+examplebin_PROGRAMS =
+
+examplebin_PROGRAMS +=		\
+	basic/basic		\
+	clinfo/clinfo
+
+#	mandelbrot/mandelbrot
+
+SOCL_EXAMPLES +=		\
+	basic/basic		\
+	clinfo/clinfo
+
+#	mandelbrot/mandelbrot
+
+basic_basic_SOURCES = basic/basic.c
+clinfo_clinfo_SOURCES = clinfo/clinfo.c
+#mandelbrot_mandelbrot_SOURCES = mandelbrot/mandelbrot.c
+
+#mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)
+#if HAVE_X11
+#mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
+#mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) -lX11 $(X_LIBS) $(X_EXTRA_LIBS)
+#endif
+
+# Print the contents of the files listed in $(TEST_LOGS). The trailing
+# /dev/null keeps cat from reading stdin when $(TEST_LOGS) expands to nothing,
+# and the leading '-' tells make to ignore a failing cat.
+showcheck:
+	-cat $(TEST_LOGS) /dev/null

+ 211 - 0
socl/examples/basic/basic.c

@@ -0,0 +1,211 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <CL/cl.h>
+
+/* Print "Error: " followed by a printf-style message on stderr, then exit
+ * with EXIT_FAILURE. The first argument must be a string literal, because it
+ * is concatenated with the "Error: " prefix. */
+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
+/* Unless err equals CL_SUCCESS, print the status code and the label str on
+ * stderr and exit with EXIT_FAILURE. err is expanded twice and is not
+ * parenthesized, so pass a plain variable rather than an expression. */
+#define check(err, str) do { if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n",err, str); exit(EXIT_FAILURE); }} while(0)
+
+/* UNUSED(x) wraps the name of a parameter that is intentionally unused: with
+ * GCC-compatible compilers it is renamed to UNUSED_x and tagged with
+ * __attribute__((unused)); with other compilers it expands to plain x. An
+ * already existing definition of UNUSED is left untouched. */
+#ifdef UNUSED
+#elif defined(__GNUC__)
+# define UNUSED(x) UNUSED_ ## x __attribute__((unused))
+#else
+# define UNUSED(x) x
+#endif
+
+#define SIZE 1024
+#define TYPE float
+#define REALSIZE (SIZE * sizeof(TYPE))
+
+/* OpenCL C source of the "add" kernel: each work-item computes a linear index
+ * from its global ids in dimensions 0 and 1 (idx = y*w + x, w being the global
+ * size in dimension 0) and stores s1[idx] + s2[idx] into d[idx]. */
+const char * kernel_src = "__kernel void add(__global float*s1, __global float*s2, __global float*d) { \
+   size_t x = get_global_id(0);\
+   size_t y = get_global_id(1);\
+   size_t w = get_global_size(0); \
+   int idx = y*w+x; \
+   d[idx] = s1[idx] + s2[idx];\
+}";
+
+
+
+/*
+ * Add two SIZE-element float vectors with the "add" kernel on the GPU
+ * device(s) of the first OpenCL platform that has one, then print the result.
+ *
+ * Exits with status 77 (the "skipped" status of the Automake test driver)
+ * when no OpenCL platform is reported, and with EXIT_FAILURE on any OpenCL
+ * error or when no GPU device is found. Returns 0 on success.
+ */
+int main(int UNUSED(argc), char** UNUSED(argv)) {
+   cl_platform_id platforms[15];
+   cl_uint num_platforms = 0;
+   cl_device_id devices[15];
+   cl_uint num_devices = 0;
+   const cl_uint max_platforms = sizeof(platforms)/sizeof(platforms[0]);
+   const cl_uint max_devices = sizeof(devices)/sizeof(devices[0]);
+   cl_context context;
+   cl_program program;
+   cl_kernel kernel;
+   cl_mem s1m, s2m, dm;
+   cl_command_queue cq;
+   cl_int err;
+
+   TYPE s1[SIZE],s2[SIZE],d[SIZE];
+
+   {
+      int i;
+      for (i=0; i<SIZE; i++) {
+         s1[i] = 2.0;
+         s2[i] = 7.0;
+         d[i] = 98.0;
+      }
+   }
+
+   printf("Querying platform...\n");
+   /* The status of this first query is deliberately not checked: as
+    * num_platforms starts at 0, a failing query is handled like "no platform"
+    * and the example is skipped instead of reading an uninitialised count. */
+   err = clGetPlatformIDs(0, NULL, &num_platforms);
+   if (num_platforms == 0) {
+      printf("No OpenCL platform found. If you use SOCL, this could mean StarPU wasn't configured for OpenCL. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
+      exit(77);
+   }
+   /* At most max_platforms ids are fetched below: never index past them. */
+   if (num_platforms > max_platforms)
+      num_platforms = max_platforms;
+   err = clGetPlatformIDs(max_platforms, platforms, NULL);
+   check(err, "clGetPlatformIDs");
+
+   printf("Querying devices...\n");
+   unsigned int platform_idx;
+   for (platform_idx=0; platform_idx<num_platforms; platform_idx++) {
+      num_devices = 0;
+      err = clGetDeviceIDs(platforms[platform_idx], CL_DEVICE_TYPE_GPU, max_devices, devices, &num_devices);
+      if (err == CL_DEVICE_NOT_FOUND) {
+         /* This platform has no GPU: that is not an error, try the next one. */
+         num_devices = 0;
+         continue;
+      }
+      check(err, "clGetDeviceIDs");
+      if (num_devices != 0)
+         break;
+   }
+   if (num_devices == 0)
+      error("No OpenCL device found\n");
+   /* clGetDeviceIDs reports how many devices exist, which may be more than
+    * the number of ids actually stored in devices[]. */
+   if (num_devices > max_devices)
+      num_devices = max_devices;
+
+   printf("Creating context...\n");
+   cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platform_idx], 0};
+   context = clCreateContext(properties, num_devices, devices, NULL, NULL, &err);
+   check(err, "clCreateContext");
+
+   printf("Creating program...\n");
+   program = clCreateProgramWithSource(context, 1, &kernel_src, NULL, &err);
+   check(err, "clCreateProgram");
+
+   printf("Building program...\n");
+   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+   check(err, "clBuildProgram");
+
+   printf("Creating kernel...\n");
+   kernel = clCreateKernel(program, "add", &err);
+   check(err, "clCreateKernel");
+
+   printf("Creating buffers...\n");
+   s1m = clCreateBuffer(context, CL_MEM_READ_WRITE, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer s1");
+   s2m = clCreateBuffer(context, CL_MEM_READ_ONLY, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer s2");
+   dm = clCreateBuffer(context, CL_MEM_WRITE_ONLY, REALSIZE, NULL, &err);
+   check(err, "clCreateBuffer d");
+
+   printf("Creating command queue...\n");
+   cl_event eventW1, eventW2, eventK, eventR;
+
+#ifdef PROFILING
+   cq = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, &err);
+#else
+   cq = clCreateCommandQueue(context, devices[0], 0, &err);
+#endif
+   check(err, "clCreateCommandQueue");
+
+   printf("Enqueueing WriteBuffers...\n");
+   err = clEnqueueWriteBuffer(cq, s1m, CL_FALSE, 0, REALSIZE, s1, 0, NULL, &eventW1);
+   check(err, "clEnqueueWriteBuffer s1");
+   err = clEnqueueWriteBuffer(cq, s2m, CL_FALSE, 0, REALSIZE, s2, 0, NULL, &eventW2);
+   check(err, "clEnqueueWriteBuffer s2");
+
+   printf("Setting kernel arguments...\n");
+   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &s1m);
+   check(err, "clSetKernelArg 0");
+   err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &s2m);
+   check(err, "clSetKernelArg 1");
+   err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dm);
+   check(err, "clSetKernelArg 2");
+
+   printf("Enqueueing NDRangeKernel...\n");
+   size_t local[3] = {16, 1, 1};
+   /* One work-item per vector element. */
+   size_t global[3] = {SIZE, 1, 1};
+   /* The kernel must not start before both input buffers are written. */
+   cl_event deps[] = {eventW1,eventW2};
+   err = clEnqueueNDRangeKernel(cq, kernel, 3, NULL, global, local, 2, deps, &eventK);
+   check(err, "clEnqueueNDRangeKernel");
+
+   printf("Enqueueing ReadBuffer...\n");
+   err = clEnqueueReadBuffer(cq, dm, CL_FALSE, 0, REALSIZE, d, 0, NULL, &eventR);
+   check(err, "clEnqueueReadBuffer");
+
+   /* The read above is non-blocking: d is only valid once the queue drained. */
+   err = clFinish(cq);
+   check(err, "clFinish");
+
+   {
+      int i;
+      for (i=0; i<SIZE; i++) {
+        printf("%f ", d[i]);
+      }
+      printf("\n");
+   }
+
+#ifdef PROFILING
+   /* Print the time elapsed between the start and the end of a command. */
+   #define DURATION(event,label) do { \
+      cl_ulong t0,t1; \
+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &t0, NULL);\
+      check(err, "clGetEventProfilingInfo");\
+      err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t1, NULL);\
+      check(err, "clGetEventProfilingInfo");\
+      printf("Profiling %s: %llu nanoseconds\n", label, (unsigned long long)(t1-t0));\
+   } while (0)
+
+   DURATION(eventW1, "first buffer writing");
+   DURATION(eventW2, "second buffer writing");
+   DURATION(eventK, "kernel execution");
+   DURATION(eventR, "result buffer reading");
+#endif
+
+
+   printf("Releasing events...\n");
+   err = clReleaseEvent(eventW1);
+   err |= clReleaseEvent(eventW2);
+   err |= clReleaseEvent(eventK);
+   err |= clReleaseEvent(eventR);
+   check(err, "clReleaseEvent");
+
+   printf("Releasing command queue...\n");
+   err = clReleaseCommandQueue(cq);
+   check(err, "clReleaseCommandQueue");
+
+   printf("Releasing buffers...\n");
+   err = clReleaseMemObject(s1m);
+   check(err, "clReleaseMemObject s1");
+   err = clReleaseMemObject(s2m);
+   check(err, "clReleaseMemObject s2");
+   err = clReleaseMemObject(dm);
+   check(err, "clReleaseMemObject d");
+
+   printf("Releasing kernel...\n");
+   err = clReleaseKernel(kernel);
+   check(err, "clReleaseKernel");
+
+   printf("Releasing program...\n");
+   err = clReleaseProgram(program);
+   check(err, "clReleaseProgram");
+
+   printf("Releasing context...\n");
+   err = clReleaseContext(context);
+   check(err, "clReleaseContext");
+
+   return 0;
+}

+ 302 - 0
socl/examples/clinfo/clinfo.c

@@ -0,0 +1,302 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <CL/cl.h>
+
+/* Abort the program with a diagnostic when an OpenCL call did not succeed.
+ *
+ * err:  status code returned by the OpenCL call
+ * name: text identifying the failed operation, echoed on stderr
+ *
+ * Returns normally only when err is CL_SUCCESS; otherwise exits with status 1.
+ */
+static inline void
+checkErr(cl_int err, const char * name) {
+    if (err == CL_SUCCESS)
+        return;
+
+    fprintf(stderr, "ERROR: %s (%d)\n", name, err);
+    exit(1);
+}
+
+int
+main(void) {
+   cl_int err;
+   cl_uint num_platforms;
+   cl_platform_id *platforms;
+
+   // Plaform info
+   err = clGetPlatformIDs(0, NULL, &num_platforms);
+   if (num_platforms == 0) {
+      printf("No OpenCL platform found. If you use SOCL, this could mean StarPU wasn't configured for OpenCL. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
+      exit(77);
+   }
+   checkErr(err, "Unable to get platform count");
+
+   platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms);
+   err = clGetPlatformIDs(num_platforms, platforms, NULL);
+   checkErr(err, "Unable to get platform list");
+   
+   
+   // Iteratate over platforms
+   printf("Number of platforms:\t\t\t\t %d\n", num_platforms);
+
+   {
+      unsigned int i;
+      for (i=0; i<num_platforms; i++) {
+         char str[256];
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, sizeof(str), &str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_PROFILE)");
+         printf("  Plaform Profile:\t\t\t\t %s\n", str);    
+
+         err= clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(str), &str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VERSION)");
+         printf("  Plaform Version:\t\t\t\t %s\n", str);    
+
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), &str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
+
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(str), &str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_VENDOR)");
+         printf("  Plaform Vendor:\t\t\t\t %s\n", str);    
+
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, sizeof(str), &str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_EXTENSIONS)");
+         printf("  Plaform Extensions:\t\t\t %s\n", str);    
+      }
+   }
+
+   printf("\n\n");
+
+   // Now Iteratate over each platform and its devices
+   {
+      unsigned int i;
+      for (i=0; i<num_platforms; i++) {
+         char str[256];
+         cl_device_id * devices;
+         cl_uint num_devices;
+
+         err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(str), &str, NULL);
+         checkErr(err, "clGetPlatformInfo(CL_PLATFORM_NAME)");
+         printf("  Plaform Name:\t\t\t\t\t %s\n", str);    
+
+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+         checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
+         devices = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices);
+         
+         err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+         checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
+
+         printf("  Number of devices:\t\t\t\t %d\n", num_devices);
+         {
+            unsigned int j;
+            for (j=0; j<num_devices; j++) {
+               cl_device_type dev_type;
+               printf("\n  DEVICE %d\n", j);
+               
+               err = clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
+               checkErr(err, "clGetDeviceInfo(CL_DEVICE_TYPE)");
+
+               printf("  Device Type:\t\t\t\t\t ");
+               if (dev_type & CL_DEVICE_TYPE_ACCELERATOR)
+                  printf("CL_DEVICE_TYPE_ACCELERATOR ");
+               else if (dev_type & CL_DEVICE_TYPE_CPU)
+                  printf("CL_DEVICE_TYPE_CPU ");
+               else if (dev_type & CL_DEVICE_TYPE_GPU)
+                  printf("CL_DEVICE_TYPE_GPU ");
+               else if (dev_type & CL_DEVICE_TYPE_DEFAULT)
+                  printf("CL_DEVICE_TYPE_DEFAULT ");
+
+               printf("\n");
+
+               {
+                  cl_uint vendor_id;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR_ID, sizeof(vendor_id), &vendor_id, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_VENDOR_ID)");
+                  printf("  Device ID:\t\t\t\t\t %d\n", vendor_id); 
+               }
+               {
+                  cl_uint units;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(units), &units, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_COMPUTE_UNITS)");
+                  printf("  Max compute units:\t\t\t\t %d\n", units); 
+               }
+
+               {
+                  cl_uint dims;
+                  size_t *sizes;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(dims), &dims, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)");
+                  printf("  Max work item dimensions:\t\t\t %d\n", dims); 
+
+                  sizes = (size_t*)malloc(dims * sizeof(size_t));
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*dims, sizes, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES)");
+                  printf("  Max work item dimensions:\t\t\t %d\n", dims); 
+
+                  {
+                     unsigned int k;
+                     printf("    Max work items:\t\t\t\t (");
+                     for (k=0; k<dims; k++) {
+                        printf("%u", (unsigned int)sizes[k]);
+                        if (k != dims-1)
+                           printf(",");
+                     }
+                     printf(")\n");
+                  }
+               }
+
+#define GET_SIZET(CL_D,str) { \
+   size_t val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (unsigned int)val); \
+}
+
+#define GET_STRING(CL_D,str,size) { \
+   char val[size]; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, val); \
+}
+
+#define GET_UINT(CL_D,str) { \
+   cl_uint val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, val); \
+}
+
+#define GET_ULONG(CL_D,str) { \
+   cl_ulong val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, val); \
+}
+
+#define GET_BOOL(CL_D,str) { \
+   cl_bool val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (val == CL_TRUE ? "Yes" : "No")); \
+}
+
+#define GET_BOOL_CUSTOM(CL_D,str,t,f) { \
+   cl_bool val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, (val == CL_TRUE ? t : f)); \
+}
+
+#define GET_BITSET_AND(TYPE,CL_D,test,str) { \
+   TYPE val; \
+   err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
+   checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
+   printf(str, ((val & test) == CL_TRUE ? "Yes" : "No")); \
+}
+      
+               GET_SIZET(CL_DEVICE_MAX_WORK_GROUP_SIZE, "  Max work group size:\t\t\t\t %u\n")
+               
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "  Preferred vector width char:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "  Preferred vector width short:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "  Preferred vector width int:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "  Preferred vector width long:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "  Preferred vector width float:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "  Preferred vector width double:\t\t %u\n")
+               GET_UINT(CL_DEVICE_MAX_CLOCK_FREQUENCY, "  Max clock frequency:\t\t\t\t %uMHz\n")
+               GET_UINT(CL_DEVICE_ADDRESS_BITS, "  Address bits:\t\t\t\t\t %ubits\n")
+               GET_ULONG(CL_DEVICE_MAX_MEM_ALLOC_SIZE, "  Max memory allocation:\t\t\t %lu bytes\n")
+
+               GET_BOOL(CL_DEVICE_IMAGE_SUPPORT, "  Image support:\t\t\t\t %s\n")
+
+               GET_SIZET(CL_DEVICE_MAX_PARAMETER_SIZE, "  Max size of kernel argument:\t\t\t %u\n")
+               GET_UINT(CL_DEVICE_MEM_BASE_ADDR_ALIGN, "  Alignment of base addres:\t\t\t %u bits\n")
+               GET_UINT(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "  Minimum alignment for any datatype:\t\t %u bytes\n")
+
+               printf("  Single precision floating point capability\n");
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_DENORM, "    Denorms:\t\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_INF_NAN, "    Quiet NaNs:\t\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_NEAREST, "    Round to nearest even:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_ZERO, "    Round to zero:\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_ROUND_TO_INF, "    Round to +ve and infinity:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_fp_config,CL_DEVICE_SINGLE_FP_CONFIG, CL_FP_FMA, "    IEEE754-2008 fused multiply-add:\t\t %s\n")
+
+               {
+                  cl_device_mem_cache_type cache;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(cache), &cache, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE)");
+                  printf("  Cache type:\t\t\t\t\t ");
+                  switch (cache) {
+                     case CL_NONE:
+                        printf("None\n");
+                        break;
+                     case CL_READ_ONLY_CACHE:
+                        printf("Read only\n");
+                        break;
+                     case CL_READ_WRITE_CACHE:
+                        printf("Read/Write\n");
+                        break;
+                  }
+               }
+
+               GET_UINT(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "  Cache line size:\t\t\t\t %u bytes\n")
+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "  Cache size:\t\t\t\t\t %lu bytes\n")
+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_SIZE, "  Global memory size:\t\t\t\t %lu bytes\n")
+               GET_ULONG(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "  Constant buffer size:\t\t\t\t %lu bytes\n")
+               GET_UINT(CL_DEVICE_MAX_CONSTANT_ARGS, "  Max number of constant args:\t\t\t %u\n")
+
+               {
+                  cl_device_local_mem_type cache;
+                  err = clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cache), &cache, NULL);
+                  checkErr(err, "clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_TYPE)");
+                  printf("  Local memory type:\t\t\t\t ");
+                  switch (cache) {
+                     case CL_LOCAL:
+                        printf("Local\n");
+                        break;
+                     case CL_GLOBAL:
+                        printf("Global\n");
+                        break;
+                  }
+               }
+
+               GET_ULONG(CL_DEVICE_LOCAL_MEM_SIZE, "  Local memory size:\t\t\t\t %lu bytes\n")
+               GET_SIZET(CL_DEVICE_PROFILING_TIMER_RESOLUTION, "  Profiling timer resolution:\t\t\t %u\n")
+               GET_BOOL_CUSTOM(CL_DEVICE_ENDIAN_LITTLE, "  Device endianess:\t\t\t\t %s\n", "Little", "Big")
+               GET_BOOL(CL_DEVICE_AVAILABLE, "  Available:\t\t\t\t\t %s\n")
+               GET_BOOL(CL_DEVICE_COMPILER_AVAILABLE, "  Compiler available:\t\t\t\t %s\n")
+
+               printf("  Execution capabilities:\t\t\t\t \n");
+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_KERNEL, "  Execute OpenCL kernels:\t\t\t %s\n")
+               GET_BITSET_AND(cl_device_exec_capabilities, CL_DEVICE_EXECUTION_CAPABILITIES, CL_EXEC_NATIVE_KERNEL, "  Execute native kernels:\t\t\t %s\n")
+
+               printf("  Queue properties:\t\t\t\t\n ");
+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, "   Out-of-Order:\t\t\t\t %s\n")
+               GET_BITSET_AND(cl_command_queue_properties, CL_DEVICE_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, "    Profiling:\t\t\t\t\t %s\n")
+
+
+               GET_STRING(CL_DEVICE_NAME, "  Name:\t\t\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DEVICE_VENDOR, "  Vendor:\t\t\t\t\t %s\n", 256);
+               GET_STRING(CL_DRIVER_VERSION, "  Driver version:\t\t\t\t %s\n", 10);
+               GET_STRING(CL_DEVICE_PROFILE, "  Profile:\t\t\t\t\t %s\n", 30);
+               GET_STRING(CL_DEVICE_VERSION, "  Version:\t\t\t\t\t %s\n", 50);
+               GET_STRING(CL_DEVICE_EXTENSIONS, "  Extensions:\t\t\t\t\t %s\n", 4096);
+            
+               printf("\n");
+            }
+         }
+      }
+   }
+
+   return 0;
+}

+ 507 - 0
socl/examples/mandelbrot/mandelbrot.c

@@ -0,0 +1,507 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010,2011 University of Bordeaux
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Uncomment this to activate X11 display */
+//#define USE_X11
+
+/* When SHORT_LOG is defined, main() only prints the frame time (in ms) for
+ * each frame instead of the longer message followed by the coordinates */
+#define SHORT_LOG 1
+/* When ROUND_ROBIN is defined, block number iby is submitted to the command
+ * queue of device (iby % num_devices); otherwise every block goes to the
+ * first device */
+#define ROUND_ROBIN
+
+/* use_x11 is the run-time switch for the X11 display: it defaults to 1 only
+ * when X11 support is compiled in, and the -no-x11 option clears it */
+#ifdef USE_X11
+#include <X11/Xlib.h>
+#include <X11/Xutil.h>
+int use_x11 = 1;
+#else
+int use_x11 = 0;
+#endif
+
+/* Set by the -demo option: each frame then zooms in on the picture */
+int demo = 0;
+/* Number of frames to render (-frames option); -1 means no frame limit */
+int frames = -1;
+
+
+#include <pthread.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <CL/cl.h>
+
+/* error(fmt, ...): print "Error: " followed by the printf-style message on
+ * stderr and exit with EXIT_FAILURE. The first argument has to be a string
+ * literal because it is concatenated with the "Error: " prefix. */
+#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
+/* check(err, str): exit with EXIT_FAILURE, after reporting the code and the
+ * label `str` on stderr, when the OpenCL status `err` is not CL_SUCCESS.
+ * `err` is expanded twice, so it should be a plain variable. */
+#define check(err, str) do { if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): %s\n",err, str); exit(EXIT_FAILURE); }} while(0)
+
+/* UNUSED(x): marks parameter x as intentionally unused. Left untouched when
+ * already defined; with GCC-compatible compilers the name is prefixed with
+ * UNUSED_ and tagged with the `unused` attribute; elsewhere it is just x. */
+#ifdef UNUSED
+#elif defined(__GNUC__)
+# define UNUSED(x) UNUSED_ ## x __attribute__((unused))
+#else
+# define UNUSED(x) x
+#endif
+
+/* OpenCL C source of the Mandelbrot kernel, built at run time by main().
+ *
+ * Each work item handles one pixel: (get_global_id(0), get_global_id(1)) is
+ * its position inside the block, and `iby*block_size` shifts the row to the
+ * block's position in the whole picture. The point is iterated at most maxIt
+ * times (stopping once x*x + y*y exceeds 4) and the iteration count is turned
+ * into a colour stored in a[] in row-major order.
+ *
+ * NOTE(review): the colour index is clamped to 256, not 255, so `255-v` wraps
+ * around for the highest iteration counts -- confirm whether this is intended.
+ * NOTE(review): `it` is an int compared against the uint maxIt -- harmless for
+ * the default maxIt but worth confirming for very large values.
+ */
+const char * kernel_src = "\
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n\
+#define TYPE double \n\
+#define MIN(a,b) (((a)<(b))? (a) : (b))\n\
+      __kernel void mandelbrot_kernel(__global uint * a,\n\
+          TYPE leftX, TYPE topY,\n\
+          TYPE stepX, TYPE stepY,\n\
+          uint maxIt, uint iby, uint block_size)\n\
+{\n\
+  TYPE xc = leftX + get_global_id(0) * stepX;\n\
+  TYPE yc = iby*block_size*stepY + topY  + get_global_id(1) * stepY;\n\
+  int it;\n\
+  TYPE x,y;\n\
+  x = y = (TYPE)0.0;\n\
+  for (it=0;it<maxIt;it++)\n\
+  {\n\
+    TYPE x2 = x*x;\n\
+    TYPE y2 = y*y;\n\
+    if (x2+y2 > (TYPE)4) break; \n\
+    TYPE twoxy = (TYPE)2*x*y;\n\
+    x = x2 - y2 + xc;\n\
+    y = twoxy + yc;\n\
+  }\n\
+  uint v = MIN((1024*((float)(it)/(2000))), 256);\n\
+  a[get_global_id(0) + get_global_id(1)*get_global_size(0)] = (v<<16|(255-v)<<8); \n\
+}";
+
+/* Default rendering parameters; each can be overridden on the command line
+ * (see parse_args) except maxIt. */
+/* Number of horizontal bands the picture is split into (-nblocks); one
+ * OpenCL buffer and one kernel launch per band */
+static cl_uint nblocks = 8;
+/* Picture size in pixels (-height / -width) */
+static cl_uint height = 768;
+static cl_uint width = 1024;
+/* Maximum number of iterations per pixel, passed to the kernel */
+static cl_uint maxIt = 20000;
+
+/* Work-group size along the first dimension (-group_size); main() requires
+ * width to be a multiple of it */
+static cl_uint group_size = 64;
+
+/* Viewport in the complex plane (-pos leftx:rightx:bottomy:topy) */
+static double leftX = -0.745;
+static double rightX = -0.74375;
+static double topY = .15;
+static double bottomY = .14875;
+
+#ifdef USE_X11
+      /* X11 data */
+      /* Connection, window, image and graphics context created by init_x11()
+       * and released by exit_x11() */
+      static Display *dpy;
+      static Window win;
+      static XImage *bitmap;
+      static GC gc;
+      /* Key symbols looked up by init_x11() and compared against key presses
+       * in handle_events() */
+      static KeySym Left=-1, Right, Down, Up, Alt ;
+      /* Held by main() around each XPutImage call */
+      static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Tear down the X11 resources created by init_x11(): the image, the window
+ * and finally the display connection.
+ *
+ * NOTE(review): per the Xlib documentation XDestroyImage also frees the pixel
+ * data attached to the image, i.e. the buffer that was handed to init_x11() --
+ * confirm that callers neither free it again nor use it afterwards.
+ */
+static void exit_x11(void)
+{
+  XDestroyImage(bitmap);
+  XDestroyWindow(dpy, win);
+  XCloseDisplay(dpy);
+}
+
+/* Open the default X11 display, create and map a window of the requested
+ * size, wrap `buffer` into an XImage and prepare the graphics context and the
+ * key symbols used by handle_events().
+ *
+ * width, height: size of the window and of the image, in pixels
+ * buffer:        pixel storage of the image, one cl_uint per pixel; the image
+ *                keeps pointing to it (no copy is made here)
+ *
+ * Exits with status 0 when no display can be opened and with EXIT_FAILURE
+ * when the image cannot be created. The results are stored in the file-level
+ * dpy, win, bitmap, gc and key symbol variables.
+ */
+static void init_x11(int width, int height, cl_uint *buffer)
+{
+  /* Attempt to open the display */
+  dpy = XOpenDisplay(NULL);
+
+  /* Failure */
+  if (!dpy)
+    exit(0);
+
+  unsigned long white = WhitePixel(dpy,DefaultScreen(dpy));
+  unsigned long black = BlackPixel(dpy,DefaultScreen(dpy));
+
+  win = XCreateSimpleWindow(dpy, DefaultRootWindow(dpy), 0, 0,
+      width, height, 0, black, white);
+
+  /* We want to be notified when the window appears */
+  XSelectInput(dpy, win, StructureNotifyMask);
+
+  /* Make it appear */
+  XMapWindow(dpy, win);
+
+  XTextProperty tp;
+  char name[128] = "Mandelbrot";
+  char *n = name;
+  Status st = XStringListToTextProperty(&n, 1, &tp);
+  if (st) {
+    XSetWMName(dpy, win, &tp);
+    /* The text property value was allocated by Xlib on our behalf; it is no
+     * longer needed once the window name has been set */
+    XFree(tp.value);
+  }
+
+  /* Send the pending requests to the server (this does not wait for the
+   * MapNotify event) */
+  XFlush(dpy);
+
+  int depth = DefaultDepth(dpy, DefaultScreen(dpy));
+  Visual *visual = DefaultVisual(dpy, DefaultScreen(dpy));
+
+  /* Make bitmap */
+  bitmap = XCreateImage(dpy, visual, depth,
+      ZPixmap, 0, (char *)buffer,
+      width, height, 32, 0);
+  if (!bitmap) {
+    fprintf(stderr, "Error: unable to create the X11 image\n");
+    exit(EXIT_FAILURE);
+  }
+
+  /* Init GC */
+  gc = XCreateGC(dpy, win, 0, NULL);
+  XSetForeground(dpy, gc, black);
+
+  XSelectInput(dpy, win, ExposureMask | KeyPressMask | StructureNotifyMask);
+
+  /* Ask the window manager to tell us when the user closes the window */
+  Atom wmDeleteMessage;
+  wmDeleteMessage = XInternAtom(dpy, "WM_DELETE_WINDOW", False);
+  XSetWMProtocols(dpy, win, &wmDeleteMessage, 1);
+
+  Left = XStringToKeysym ("Left");
+  Right = XStringToKeysym ("Right");
+  Up = XStringToKeysym ("Up");
+  Down = XStringToKeysym ("Down");
+  Alt = XStringToKeysym ("Alt");
+}
+
+/* Wait for the next X11 event (XNextEvent blocks until one is available) and
+ * apply it to the viewport stored in leftX/rightX/topY/bottomY:
+ *   - arrow keys shift the view by 5% of its extent,
+ *   - '+' and '-' zoom in and out by 5%,
+ *   - 'q' asks the caller to quit.
+ *
+ * Returns -1 when 'q' was pressed, 0 otherwise.
+ */
+static int handle_events(void)
+{
+  XEvent event;
+  XNextEvent(dpy, &event);
+
+  KeySym key;
+  /* Zero-filled so that text[0] is well defined when the key press does not
+   * translate into any character (arrow keys, modifiers, ...) */
+  char text[255] = {0};
+
+  double coef = 0.05;
+
+  if (event.type == KeyPress)
+  {
+    XLookupString(&event.xkey,text,sizeof(text),&key,0);
+    if (key == Left)
+    {
+      double widthX = rightX - leftX;
+      leftX -= coef*widthX;
+      rightX -= coef*widthX;
+    }
+    else if (key == Right)
+    {
+      double widthX = rightX - leftX;
+      leftX += coef*widthX;
+      rightX += coef*widthX;
+    }
+    else if (key == Down)
+    {
+      double heightY = topY - bottomY;
+      topY += coef*heightY;
+      bottomY += coef*heightY;
+    }
+    else if (key == Up)
+    {
+      double heightY = topY - bottomY;
+      topY -= coef*heightY;
+      bottomY -= coef*heightY;
+    }
+    else {
+      double widthX = rightX - leftX;
+      double heightY = topY - bottomY;
+
+      if (text[0] == '-')
+      {
+        /* Zoom out */
+        leftX -= (coef/2)*widthX;
+        rightX += (coef/2)*widthX;
+        topY += (coef/2)*heightY;
+        bottomY -= (coef/2)*heightY;
+      }
+      else if (text[0] == '+')
+      {
+        /* Zoom in */
+        leftX += (coef/2)*widthX;
+        rightX -= (coef/2)*widthX;
+        topY -= (coef/2)*heightY;
+        bottomY += (coef/2)*heightY;
+      }
+    }
+
+    if (text[0]=='q') {
+      return -1;
+    }
+  }
+
+  if (event.type==ButtonPress) {
+    /* tell where the mouse Button was Pressed */
+    printf("You pressed a button at (%i,%i)\n",
+        event.xbutton.x,event.xbutton.y);
+  }
+
+  return 0;
+}
+#endif //USE_X11
+
+/* Return the value that follows option argv[*i] and advance *i onto it.
+ * Exits with EXIT_FAILURE when the option is the last argument. */
+static const char *option_value(int argc, char **argv, int *i)
+{
+	if (*i + 1 >= argc) {
+		fprintf(stderr, "Missing value for option %s\n", argv[*i]);
+		exit(EXIT_FAILURE);
+	}
+	return argv[++(*i)];
+}
+
+/* Parse the value of option argv[*i] as a base-10 integer, advancing *i.
+ * Exits with EXIT_FAILURE when the value is missing or is not a number. */
+static long option_long(int argc, char **argv, int *i)
+{
+	const char *opt = argv[*i];
+	const char *str = option_value(argc, argv, i);
+	char *end;
+	long val = strtol(str, &end, 10);
+
+	if (end == str || *end != '\0') {
+		fprintf(stderr, "Invalid value '%s' for option %s\n", str, opt);
+		exit(EXIT_FAILURE);
+	}
+	return val;
+}
+
+/* Same as option_long(), for the options that need a strictly positive value
+ * fitting in a cl_uint (sizes and counts that main() later divides by). */
+static cl_uint option_positive(int argc, char **argv, int *i)
+{
+	const char *opt = argv[*i];
+	long val = option_long(argc, argv, i);
+
+	if (val <= 0 || val != (long)(cl_uint)val) {
+		fprintf(stderr, "Value of option %s must be a positive integer\n", opt);
+		exit(EXIT_FAILURE);
+	}
+	return (cl_uint)val;
+}
+
+/* Update the file-level settings (width, height, nblocks, group_size, frames,
+ * viewport, demo, use_x11) from the command line.
+ *
+ * argc, argv: the arguments received by main()
+ *
+ * -h prints the usage and exits with status -1. A missing or malformed option
+ * value is reported on stderr and exits with EXIT_FAILURE. Unknown arguments
+ * are ignored.
+ */
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-h") == 0) {
+			fprintf(stderr, "Usage: %s [-h] [ -width 1024] [-height 768] [-nblocks 16] [-group_size 64] [-no-x11] [-demo] [-frames N] [-pos leftx:rightx:bottomy:topy]\n", argv[0]);
+			exit(-1);
+		}
+		else if (strcmp(argv[i], "-width") == 0) {
+			width = option_positive(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-frames") == 0) {
+			/* may legitimately be -1 (no frame limit) */
+			frames = (int)option_long(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-height") == 0) {
+			height = option_positive(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-group_size") == 0) {
+			group_size = option_positive(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-nblocks") == 0) {
+			nblocks = option_positive(argc, argv, &i);
+		}
+		else if (strcmp(argv[i], "-pos") == 0) {
+			const char *pos = option_value(argc, argv, &i);
+			int ret = sscanf(pos, "%lf:%lf:%lf:%lf", &leftX, &rightX, &bottomY, &topY);
+			/* checked explicitly so that the test survives NDEBUG builds */
+			if (ret != 4) {
+				fprintf(stderr, "Invalid value '%s' for option -pos\n", pos);
+				exit(EXIT_FAILURE);
+			}
+		}
+		else if (strcmp(argv[i], "-demo") == 0) {
+			/* start from a wide view; main() zooms in at each frame */
+			demo = 1;
+			leftX = -50.22749575062760;
+			rightX = 48.73874621262927;
+			topY = -49.35016705749115;
+			bottomY = 49.64891691946615;
+		}
+		else if (strcmp(argv[i], "-no-x11") == 0) {
+#ifdef USE_X11
+			use_x11 = 0;
+#endif
+		}
+	}
+}
+
+/* Upper bounds on the number of platforms examined and devices used */
+#define MAX_PLATFORMS 15
+#define MAX_DEVICES 20
+
+/* Index of the command queue (and device) in charge of block `iby`. */
+static cl_uint block_queue_index(cl_uint iby, cl_uint num_devices)
+{
+#ifdef ROUND_ROBIN
+  return iby % num_devices;
+#else
+  (void)iby;
+  (void)num_devices;
+  return 0;
+#endif
+}
+
+/* Wait until every command submitted to the `num_devices` queues is done. */
+static void finish_queues(cl_command_queue *cq, cl_uint num_devices)
+{
+  cl_uint i;
+  for (i = 0; i < num_devices; i++)
+    clFinish(cq[i]);
+}
+
+/* Render the Mandelbrot set with OpenCL.
+ *
+ * The picture is split into `nblocks` horizontal bands; each band has its own
+ * OpenCL buffer (backed by a slice of one host buffer) and is computed by one
+ * kernel launch per frame, on the GPU devices of the first platform that has
+ * any. The frame time is printed on stderr. Frames are rendered until the
+ * frame limit is reached, the user quits (X11 mode) or, without a display,
+ * after a single frame unless the demo mode is active.
+ *
+ * Returns 0 on success; exits with EXIT_FAILURE on any OpenCL or allocation
+ * failure, and with status 0 when no OpenCL platform is available.
+ */
+int main(int argc, char **argv) {
+  cl_platform_id platforms[MAX_PLATFORMS];
+  cl_uint num_platforms = 0;
+  cl_device_id devices[MAX_DEVICES];
+  cl_uint num_devices = 0;
+  cl_context context;
+  cl_program program;
+  cl_kernel kernel;
+  cl_command_queue cq[MAX_DEVICES];
+  cl_int err;
+  cl_uint i;
+
+  parse_args(argc, argv);
+
+  /* Both values are used as divisors below */
+  if (nblocks == 0 || group_size == 0)
+    error("nblocks and group_size must not be zero\n");
+
+  cl_uint block_size = height/nblocks;
+  assert((height % nblocks) == 0);
+  assert((width % group_size) == 0);
+
+  err = clGetPlatformIDs(0, NULL, &num_platforms);
+  if (num_platforms == 0) {
+    printf("No OpenCL platform found. If you use SOCL, this could mean StarPU wasn't configured for OpenCL. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
+    exit(0);
+  }
+  /* Only the platforms that fit in the local array are examined */
+  if (num_platforms > MAX_PLATFORMS)
+    num_platforms = MAX_PLATFORMS;
+  err = clGetPlatformIDs(num_platforms, platforms, NULL);
+  check(err, "clGetPlatformIDs");
+
+  /* Use the GPUs of the first platform that has some */
+  unsigned int platform_idx;
+  for (platform_idx=0; platform_idx<num_platforms; platform_idx++) {
+    num_devices = 0;
+    err = clGetDeviceIDs(platforms[platform_idx], CL_DEVICE_TYPE_GPU, MAX_DEVICES, devices, &num_devices);
+    if (err == CL_DEVICE_NOT_FOUND) {
+      /* not an error: this platform simply has no GPU, try the next one */
+      num_devices = 0;
+      continue;
+    }
+    check(err, "clGetDeviceIDs");
+    if (num_devices != 0)
+      break;
+  }
+  if (num_devices == 0)
+    error("No OpenCL device found\n");
+  /* The reported count is the number of matching devices, which can be
+   * larger than what the array received */
+  if (num_devices > MAX_DEVICES)
+    num_devices = MAX_DEVICES;
+
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platform_idx], 0};
+  context = clCreateContext(properties, num_devices, devices, NULL, NULL, &err);
+  check(err, "clCreateContext");
+
+  program = clCreateProgramWithSource(context, 1, &kernel_src, NULL, &err);
+  check(err, "clCreateProgram");
+
+  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+  check(err, "clBuildProgram");
+
+  kernel = clCreateKernel(program, "mandelbrot_kernel", &err);
+  check(err, "clCreateKernel");
+
+  /* One out-of-order queue per device */
+  for (i=0; i<num_devices; i++) {
+    cq[i] = clCreateCommandQueue(context, devices[i],  CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
+    check(err, "clCreateCommandQueue");
+  }
+
+  /* Host storage for the whole picture, one cl_uint per pixel */
+  size_t block_bytes = (size_t)block_size*width*sizeof(cl_uint);
+  cl_uint *buffer;
+  buffer = malloc((size_t)height*width*sizeof(cl_uint));
+  if (buffer == NULL)
+    error("Unable to allocate the picture buffer\n");
+
+#ifdef USE_X11
+  if (use_x11)
+    init_x11(width, height, buffer);
+#endif // USE_X11
+
+  /* One OpenCL buffer per band, each using its slice of the host buffer */
+  cl_mem block_handles[nblocks];
+
+  cl_uint iby;
+
+  for (iby = 0; iby < nblocks; iby++) {
+    cl_uint *data = &buffer[(size_t)iby*block_size*width];
+    block_handles[iby] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, block_bytes, data, &err);
+    check(err, "clCreateBuffer");
+  }
+
+  int stop = 0;
+  int frame = 0;
+
+  while (!stop) {
+    struct timeval start, end;
+    gettimeofday(&start, NULL);
+
+    if (frames != -1) {
+      frame++;
+      stop = (frame == frames);
+    }
+
+    double stepX = (rightX - leftX)/width;
+    double stepY = (topY - bottomY)/height;
+    cl_event ker_events[nblocks];
+    void * ptrs[nblocks];
+
+    /* Arguments shared by every block of this frame */
+    err = clSetKernelArg(kernel, 1, sizeof(cl_double), &leftX);
+    check(err, "clSetKernelArg leftX");
+    err = clSetKernelArg(kernel, 2, sizeof(cl_double), &topY);
+    check(err, "clSetKernelArg topY");
+    err = clSetKernelArg(kernel, 3, sizeof(cl_double), &stepX);
+    check(err, "clSetKernelArg stepX");
+    err = clSetKernelArg(kernel, 4, sizeof(cl_double), &stepY);
+    check(err, "clSetKernelArg stepY");
+    err = clSetKernelArg(kernel, 5, sizeof(cl_uint), &maxIt);
+    check(err, "clSetKernelArg maxIt");
+    err = clSetKernelArg(kernel, 7, sizeof(cl_uint), &block_size);
+    check(err, "clSetKernelArg block_size");
+
+    /* Launch one kernel per band */
+    for (iby = 0; iby < nblocks; iby++) {
+      cl_uint dev = block_queue_index(iby, num_devices);
+      size_t local[3] = {group_size, 1, 1};
+      size_t global[3] = {width, block_size, 1};
+
+      err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &block_handles[iby]);
+      check(err, "clSetKernelArg out");
+      err = clSetKernelArg(kernel, 6, sizeof(cl_uint), &iby);
+      check(err, "clSetKernelArg iby");
+
+      err = clEnqueueNDRangeKernel(cq[dev], kernel, 3, NULL, global, local, 0, NULL, &ker_events[iby]);
+      check(err, "clEnqueueNDRangeKernel");
+    }
+
+    /* Map each band back to the host once its kernel has completed */
+    for (iby = 0; iby < nblocks; iby++) {
+      cl_uint dev = block_queue_index(iby, num_devices);
+      ptrs[iby] = clEnqueueMapBuffer(cq[dev], block_handles[iby], CL_FALSE,CL_MAP_READ, 0, block_bytes, 1, &ker_events[iby], NULL, &err);
+      check(err, "clEnqueueMapBuffer");
+    }
+
+    finish_queues(cq, num_devices);
+
+    gettimeofday(&end, NULL);
+    double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+#ifdef SHORT_LOG
+    fprintf(stderr, "%f\n", timing/1000.0);
+#else
+    fprintf(stderr, "Time to generate frame : %f ms\n", timing/1000.0);
+    fprintf(stderr, "%14.14f:%14.14f:%14.14f:%14.14f\n", leftX, rightX, bottomY, topY);
+#endif
+
+#ifdef USE_X11
+    if (use_x11) {
+      for (iby = 0; iby < nblocks; iby++) {
+        pthread_mutex_lock(&mutex);
+        XPutImage(dpy, win, gc, bitmap,
+            0, iby*block_size,
+            0, iby*block_size,
+            width, block_size);
+        pthread_mutex_unlock(&mutex);
+      }
+    }
+#endif
+
+    for (iby = 0; iby < nblocks; iby++) {
+      cl_uint dev = block_queue_index(iby, num_devices);
+      err = clEnqueueUnmapMemObject(cq[dev], block_handles[iby], ptrs[iby], 0, NULL, NULL);
+      check(err, "clEnqueueUnmapMemObject");
+      clReleaseEvent(ker_events[iby]);
+    }
+
+    /* The queues are out-of-order: make sure the bands are unmapped before
+     * the kernels of the next frame may write to them */
+    finish_queues(cq, num_devices);
+
+    if (demo) {
+      /* Zoom in */
+      double zoom_factor = 0.05;
+      double widthX = rightX - leftX;
+      double heightY = topY - bottomY;
+      leftX += (zoom_factor/2)*widthX;
+      rightX -= (zoom_factor/2)*widthX;
+      topY -= (zoom_factor/2)*heightY;
+      bottomY += (zoom_factor/2)*heightY;
+    }
+    else {
+#ifdef USE_X11
+      if (use_x11) {
+        /* a negative value means the user asked to quit */
+        if (handle_events() < 0)
+          stop = 1;
+      }
+      else if (frames == -1) {
+        /* no window and no frame limit: nothing else could end the loop */
+        stop = 1;
+      }
+#else
+      stop = 1;
+#endif
+    }
+  }
+
+  /* The OpenCL buffers use the host buffer, so they are released first */
+  for (iby = 0; iby < nblocks; iby++) {
+    clReleaseMemObject(block_handles[iby]);
+  }
+
+  for (i=0; i<num_devices; i++)
+    clReleaseCommandQueue(cq[i]);
+
+  clReleaseKernel(kernel);
+  clReleaseProgram(program);
+  clReleaseContext(context);
+
+  /* With X11 the host buffer belongs to the image and is released together
+   * with it by exit_x11(); otherwise it is ours to free */
+#ifdef USE_X11
+  if (use_x11)
+    exit_x11();
+  else
+#endif
+    free(buffer);
+
+  return 0;
+}

+ 29 - 0
socl/socl-1.0.pc.in

@@ -0,0 +1,29 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: socl
+Description: offers OpenCL implementation on top of StarPU
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@/socl
+Libs: -L${libdir} -lsocl-@STARPU_EFFECTIVE_VERSION@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: starpu-1.0
+Requires.private:

+ 24 - 7
socl/src/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2009-2012  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -14,16 +14,33 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-AM_CFLAGS = -Wall -Wextra
-LIBS = $(top_builddir)/src/libstarpu.la
+AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
+libsocl_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS)
+AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) -no-undefined
 
 SUBDIRS =
 
-lib_LTLIBRARIES = libsocl.la
+lib_LTLIBRARIES = libsocl-@STARPU_EFFECTIVE_VERSION@.la
 
-libsocl_la_SOURCES = 						\
+noinst_HEADERS =				\
+  command.h					\
+  command_list.h				\
+  command_queue.h				\
+  debug.h					\
+  devices.h					\
+  event.h					\
+  gc.h						\
+  getinfo.h					\
+  mem_objects.h					\
+  socl.h					\
+  task.h					\
+  util.h
+
+libsocl_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined			\
+  -version-info $(LIBSOCL_INTERFACE_CURRENT):$(LIBSOCL_INTERFACE_REVISION):$(LIBSOCL_INTERFACE_AGE)
+
+libsocl_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
   command.c \
   command_list.c \
   command_queue.c \

+ 3 - 3
socl/src/cl_enqueuecopybuffer.c

@@ -58,11 +58,11 @@ static void soclEnqueueCopyBuffer_cpu_task(void *descr[], void *args) {
    free(arg);
 }
 
-static starpu_codelet codelet_copybuffer = {
+static struct starpu_codelet codelet_copybuffer = {
    .where = STARPU_CPU | STARPU_OPENCL,
    .model = NULL,
-   .cpu_func = &soclEnqueueCopyBuffer_cpu_task,
-   .opencl_func = &soclEnqueueCopyBuffer_opencl_task,
+   .cpu_funcs = { &soclEnqueueCopyBuffer_cpu_task, NULL },
+   .opencl_funcs = { &soclEnqueueCopyBuffer_opencl_task, NULL },
    .nbuffers = 2
 };
 

+ 1 - 1
socl/src/cl_enqueuemapbuffer.c

@@ -26,7 +26,7 @@ static void mapbuffer_callback(void *args) {
 static void mapbuffer_task(void *args) {
 	command_map_buffer cmd = (command_map_buffer)args;
 
-	starpu_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
+	enum starpu_access_mode mode = (cmd->map_flags == CL_MAP_READ ? STARPU_R : STARPU_RW);
 
 	starpu_data_acquire_cb(cmd->buffer->handle, mode, mapbuffer_callback, cmd);
 }

+ 2 - 2
socl/src/cl_enqueuendrangekernel.c

@@ -102,7 +102,7 @@ static void cleaning_task_callback(void *args) {
 	free(co);
 }
 
-static struct starpu_perfmodel_t perf_model = {
+static struct starpu_perfmodel perf_model = {
 	.type = STARPU_HISTORY_BASED,
 	.symbol = "perf_model"
 };
@@ -117,7 +117,7 @@ cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd) {
 	task->cl_arg = cmd;
 	task->cl_arg_size = sizeof(cmd);
 
-	starpu_codelet * codelet = cmd->codelet;
+	struct starpu_codelet * codelet = cmd->codelet;
 
 	/* We need to detect which parameters are OpenCL's memory objects and
 	 * we retrieve their corresponding StarPU buffers */

+ 3 - 3
socl/src/cl_enqueuereadbuffer.c

@@ -58,11 +58,11 @@ static void soclEnqueueReadBuffer_opencl_task(void *descr[], void *args) {
    free(args);
 }
 
-static starpu_codelet codelet_readbuffer = {
+static struct starpu_codelet codelet_readbuffer = {
    .where = STARPU_OPENCL,
    .model = NULL,
-   .cpu_func = &soclEnqueueReadBuffer_cpu_task,
-   .opencl_func = &soclEnqueueReadBuffer_opencl_task,
+   .cpu_funcs = { &soclEnqueueReadBuffer_cpu_task, NULL },
+   .opencl_funcs = { &soclEnqueueReadBuffer_opencl_task, NULL },
    .nbuffers = 1
 };
 

+ 3 - 3
socl/src/cl_enqueuewritebuffer.c

@@ -60,11 +60,11 @@ static void soclEnqueueWriteBuffer_opencl_task(void *descr[], void *args) {
    free(args);
 }
 
-static starpu_codelet codelet_writebuffer = {
+static struct starpu_codelet codelet_writebuffer = {
    .where = STARPU_OPENCL,
    .model = NULL,
-   .cpu_func = &soclEnqueueWriteBuffer_cpu_task,
-   .opencl_func = &soclEnqueueWriteBuffer_opencl_task,
+   .cpu_funcs = { &soclEnqueueWriteBuffer_cpu_task, NULL },
+   .opencl_funcs = { &soclEnqueueWriteBuffer_opencl_task, NULL },
    .nbuffers = 1
 };
 

+ 0 - 1
socl/src/cl_finish.c

@@ -22,7 +22,6 @@ soclFinish(cl_command_queue cq) CL_API_SUFFIX__VERSION_1_0 {
 	command_marker cmd = command_barrier_create();
 
 	command_queue_enqueue(cq, cmd, 0, NULL);
-		cl_event ev = command_event_get(cmd);
 
 	MAY_BLOCK(CL_TRUE)
 

+ 2 - 2
socl/src/cl_getkernelworkgroupinfo.c

@@ -81,9 +81,9 @@ static void gkwgi_task2(void **UNUSED(desc), void *data) {
    }
 }
 
-static starpu_codelet gkwgi_codelet = {
+static struct starpu_codelet gkwgi_codelet = {
    .where = STARPU_OPENCL,
-   .opencl_func = gkwgi_task2,
+   .opencl_funcs = { gkwgi_task2, NULL },
    .nbuffers = 0,
    .model = NULL
 };

+ 3 - 3
socl/src/cl_getplatformids.c

@@ -25,11 +25,11 @@ soclGetPlatformIDs(cl_uint          num_entries,
                  cl_uint *        num_platforms) CL_API_SUFFIX__VERSION_1_0
 {
    if ((num_entries == 0 && platforms != NULL)
-      || (num_platforms == NULL && platforms == NULL))
-      return CL_INVALID_VALUE;
+       || (num_platforms == NULL && platforms == NULL))
+	   return CL_INVALID_VALUE;
 
    if (starpu_opencl_worker_get_count() == 0) {
-      DEBUG_ERROR("StarPU didn't find any OpenCL device. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).")
+      DEBUG_MSG("StarPU didn't find any OpenCL device. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n")
 
       if (num_platforms != NULL)
          *num_platforms = 0;

+ 3 - 3
socl/src/command.c

@@ -96,7 +96,7 @@ command_ndrange_kernel command_ndrange_kernel_create (
 		const size_t *   global_work_size,
 		const size_t *   local_work_size)
 {
-	command_ndrange_kernel cmd = malloc(sizeof(struct command_ndrange_kernel_t));
+	command_ndrange_kernel cmd = calloc(1, sizeof(struct command_ndrange_kernel_t));
 	command_init(cmd, CL_COMMAND_NDRANGE_KERNEL);
 
 	dupEntity(kernel);
@@ -106,8 +106,8 @@ command_ndrange_kernel command_ndrange_kernel_create (
 	nullOrDup(local_work_size, work_dim*sizeof(size_t));
 
    	/* Codelet */
-   	cmd->codelet = (starpu_codelet*)malloc(sizeof(starpu_codelet));
-	starpu_codelet * codelet = cmd->codelet;
+   	cmd->codelet = (struct starpu_codelet*)calloc(1, sizeof(struct starpu_codelet));
+	struct starpu_codelet * codelet = cmd->codelet;
 	codelet->where = STARPU_OPENCL;
 	codelet->power_model = NULL;
 	codelet->opencl_func = &soclEnqueueNDRangeKernel_task;

+ 1 - 1
socl/src/command.h

@@ -75,7 +75,7 @@ typedef struct command_ndrange_kernel_t {
 	size_t *	 arg_sizes;
 	enum kernel_arg_type * arg_types;
 	void **		 args;
-	starpu_codelet * codelet;
+	struct starpu_codelet * codelet;
 	cl_uint		 num_buffers;
 	cl_mem *	 buffers;
 } * command_ndrange_kernel;

+ 6 - 1
socl/src/init.c

@@ -23,9 +23,14 @@
  */
 __attribute__((constructor)) static void socl_init() {
   
+  struct starpu_conf conf;
+  starpu_conf_init(&conf);
+  conf.ncuda = 0;
+  putenv("STARPU_NCUDA=0");
+
   mem_object_init();
 
-  starpu_init(NULL);
+  starpu_init(&conf);
   
   /* Disable dataflow implicit dependencies */
   starpu_data_set_default_sequential_consistency_flag(0);

+ 1 - 1
socl/src/socl.h

@@ -186,7 +186,7 @@ struct _cl_mem {
   CL_ENTITY;
 
   /* StarPU handle */
-  starpu_data_handle handle;
+  starpu_data_handle_t handle;
 
   /* Pointer to data in host memory */
   void *ptr;    

+ 3 - 3
socl/src/task.c

@@ -19,7 +19,7 @@
 #include "event.h"
 
 static void task_release_callback(void *arg) {
-  starpu_task task = starpu_get_current_task();
+  starpu_task task = starpu_task_get_current();
   cl_command cmd = (cl_command)arg;
   
   cl_event ev = command_event_get(cmd);
@@ -126,10 +126,10 @@ static void cputask_task(__attribute__((unused)) void *descr[], void *args) {
   free(arg);
 }
 
-static starpu_codelet cputask_codelet = {
+static struct starpu_codelet cputask_codelet = {
    .where = STARPU_CPU,
    .model = NULL,
-   .cpu_func = &cputask_task
+   .cpu_funcs = { &cputask_task, NULL }
 };
 
 starpu_task task_create_cpu(void (*callback)(void*), void *arg, int free_arg) {