12 years ago · ea45749090
--- a/.gitignore
+++ b/.gitignore
@@ -23,7 +23,7 @@ Makefile.in
 
				 .dirstamp
			
 
				 stamp-h[0-9]*
			
 
				 starpu.log
			
 
				-/gcc-plugin/include/starpu-gcc/config.h
			
 
				+/gcc-plugin/src/starpu-gcc-config.h
			
 
				 /gcc-plugin/tests/*.c.[0-9]*.*
			
 
				 /tests/datawizard/handle_to_pointer
			
 
				 /tests/datawizard/data_lookup
			
@@ -287,5 +287,3 @@ starpu.log
 
				 /tests/datawizard/interfaces/copy_interfaces
			
 
				 /gcc-plugin/tests/release
			
 
				 /gcc-plugin/tests/opencl
			
 
				-/gcc-plugin/tests/registered
			
 
				-/gcc-plugin/tests/warn-unregistered
			
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,6 +6,7 @@ Nathalie Furmento <nathalie.furmento@labri.fr>
 
				 Sylvain Henry <sylvain.henry@inria.fr>
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 Cyril Roelandt <cyril.roelandt@inria.fr>
			
 
				 =======
			
 
				 Cyril Roélandt <cyril.roelandt@inria.fr>
			
@@ -13,6 +14,9 @@ Cyril Roélandt <cyril.roelandt@inria.fr>
 
				 Cyril Roelandt <cyril.roelandt@inria.fr>
			
 
				 >>>>>>> .merge-right.r7640
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+Cyril Roélandt <cyril.roelandt@inria.fr>
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 François Tessier <francois.tessier@inria.fr>
			
 
				 Samuel Thibault <samuel.thibault@labri.fr>
			
 
				 Pierre André Wacrenier <wacrenier@labri.fr>
			
--- a/ChangeLog
+++ b/ChangeLog
@@ -19,89 +19,6 @@ StarPU 1.1.0 (svn revision xxxx)
 
				 
			
 
				 New features:
			
 
				   * OpenGL interoperability support.
			
 
				-  * Capability to store compiled OpenCL kernels on the file system
			
 
				-  * Capability to load compiled OpenCL kernels
			
 
				-  * Performance models measurements can now be provided explicitly by
			
 
				-    applications.
			
 
				-  * Capability to emit communication statistics when running MPI code
			
 
				-  * Add starpu_unregister_submit, starpu_data_acquire_on_node and
			
 
				-    starpu_data_invalidate_submit
			
 
				-  * New functionnality to wrapper starpu_insert_task to pass a array of
			
 
				-	data_handles via the parameter STARPU_DATA_ARRAY
			
 
				-  * Enable GPU-GPU direct transfers.
			
 
				-  * GCC plug-in
			
 
				-	- Add `registered' attribute
			
 
				-	- A new pass was added that warns about the use of possibly
			
 
				-	  unregistered memory buffers.
			
 
				-  * SOCL
			
 
				-        - Manual mapping of commands on specific devices is now possible
			
 
				-  * New interface: COO matrix.
			
 
				-
			
 
				-Changes:
			
 
				-  * Fix the block filter functions.
			
 
				-  * Fix StarPU-MPI on Darwin.
			
 
				-  * The FxT code can now be used on systems other than Linux.
			
 
				-  * Keep only one hashtable implementation common/uthash.h
			
 
				-  * The cache of starpu_mpi_insert_task is fixed and thus now enabled by
			
 
				-    default.
			
 
				-  * Improve starpu_machine_display output.
			
 
				-  * Standardize objects name in the performance model API
			
 
				-  * SOCL
			
 
				-    - Virtual SOCL device has been removed
			
 
				-    - Automatic scheduling still available with command queues not
			
 
				-      assigned to any device
			
 
				-    - Remove modified OpenCL headers. ICD is now the only supported
			
 
				-      way to use SOCL.
			
 
				-    - SOCL test suite is only run when environment variable
			
 
				-      SOCL_OCL_LIB_OPENCL is defined. It should contain the location
			
 
				-      of the libOpenCL.so file of the OCL ICD implementation.
			
 
				-  * Fix main memory leak on multiple unregister/re-register.
			
 
				-  * Improve hwloc detection by configure
			
 
				-
			
 
				-Small changes:
			
 
				-  * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
			
 
				-	still available for compatibility reasons.
			
 
				-  * include/starpu.h includes all include/starpu_*.h files, applications
			
 
				-	therefore only need to have #include <starpu.h>
			
 
				-  * Active task wait is now included in blocked time.
			
 
				-  * Fix GCC plugin linking issues starting with GCC 4.7.
			
 
				-  * Fix forcing calibration of never-calibrated archs.
			
 
				-  * CUDA applications are no longer compiled with the "-arch sm_13"
			
 
				-    option. It is specifically added to applications which need it.
			
 
				-
			
 
				-StarPU 1.0.3 (svn revision 7379)
			
 
				-==============================================
			
 
				-
			
 
				-Changes:
			
 
				-  * Several bug fixes in the build system
			
 
				-  * Bug fixes in source code for non-Linux systems
			
 
				-  * Fix generating FXT traces bigger than 64MiB.
			
 
				-  * Improve ENODEV error detections in StarPU FFT
			
 
				-
			
 
				-StarPU 1.0.2 (svn revision xxx)
			
 
				-==============================================
			
 
				-
			
 
				-Changes:
			
 
				-  * Add starpu_block_shadow_filter_func_vector and an example.
			
 
				-  * Add tag dependency in trace-generated DAG.
			
 
				-  * Fix CPU binding for optimized CPU-GPU transfers.
			
 
				-  * Fix parallel tasks CPU binding and combined worker generation.
			
 
				-  * Fix generating FXT traces bigger than 64MiB.
			
 
				-
			
 
				-StarPU 1.0.1 (svn revision 6659)
			
 
				-==============================================
			
 
				-
			
 
				-Changes:
			
 
				-  * hwloc support. Warn users when hwloc is not found on the system and
			
 
				-	produce error when not explicitely disabled.
			
 
				-  * Several bug fixes
			
 
				-  * GCC plug-in
			
 
				-	- Add `#pragma starpu release'
			
 
				-	- Fix bug when using `acquire' pragma with function parameters
			
 
				-	- Slightly improve test suite coverage
			
 
				-	- Relax the GCC version check
			
 
				-  * Update SOCL to use new API
			
 
				-  * Documentation improvement.
			
 
				 
			
 
				 StarPU 1.0.0 (svn revision 6306)
			
 
				 ==============================================
			
@@ -302,9 +219,3 @@ Changes:
 
				    - transparent data coherency management
			
 
				    - High-level expressive interface
			
 
				 
			
 
				-
			
 
				-# Local Variables:
			
 
				-# mode: text
			
 
				-# coding: utf-8
			
 
				-# ispell-local-dictionary: "american"
			
 
				-# End:
			
--- a/configure.ac
+++ b/configure.ac
@@ -951,6 +951,7 @@ if test x$use_fxt = xyes; then
 
				 	else
			
 
				 	    PKG_CHECK_MODULES([FXT],  [fxt])
			
 
				 	fi
			
 
				+<<<<<<< .working
			
 
				 	save_LIBS="$LIBS"
			
 
				 	LIBS="$LIBS $FXT_LIBS"
			
 
				 	save_LDFLAGS="$LDFLAGS"
			
@@ -964,6 +965,8 @@ if test x$use_fxt = xyes; then
 
				 	AC_CHECK_DECLS([enable_fut_flush])
			
 
				 	AC_CHECK_DECLS([fut_set_filename])
			
 
				 	CFLAGS="$save_CFLAGS"
			
 
				+=======
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 fi
			
 
				 
			
 
				 AC_MSG_CHECKING(whether performance debugging should be enabled)
			
@@ -1898,6 +1901,7 @@ AC_MSG_NOTICE([
 
				                SOCL test suite: $run_socl_check
			
 
				 >>>>>>> .merge-right.r7640
			
 
				 ])
			
 
				+<<<<<<< .working
			
 
				 
			
 
				 if test "$build_socl" = "yes" -a "$run_socl_check" = "no" ; then
			
 
				 	AC_MSG_NOTICE([
			
@@ -1914,3 +1918,13 @@ WARNING: hwloc was not enabled.  If the target machine is hyperthreaded the
 
				 performance may be impacted a lot.  It is strongly recommended to install
			
 
				 hwloc])
			
 
				 fi
			
 
				+=======
			
 
				+
			
 
				+if test x"$have_valid_hwloc" = xno
			
 
				+then
			
 
				+  AC_MSG_NOTICE([
			
 
				+WARNING: hwloc was not enabled.  If the target machine is hyperthreaded the
			
 
				+performance may be impacted a lot.  It is strongly recommended to install
			
 
				+hwloc])
			
 
				+fi
			
 
				+>>>>>>> .merge-right.r6541
			
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -816,6 +816,7 @@ static struct starpu_sched_policy dummy_sched_policy = @{
 
				 @end smallexample
			
 
				 @end cartouche
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 @node Running drivers
			
 
				 @section Running drivers
			
 
				 
			
@@ -883,6 +884,58 @@ if (ret != 0)
 
				 @end smallexample
			
 
				 @end cartouche
			
 
				 
			
 
				+=======
			
 
				+@node Running drivers
			
 
				+@section Running drivers
			
 
				+
			
 
				+@menu
			
 
				+* Driver API::
			
 
				+* Example::
			
 
				+@end menu
			
 
				+
			
 
				+@node Driver API
			
 
				+@subsection Driver API
			
 
				+
			
 
				+@deftypefun int starpu_driver_init(struct starpu_driver *@var{d})
			
 
				+Initialize the given driver. Returns 0 on success, -EINVAL if d->type is not
			
 
				+STARPU_CUDA_WORKER.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@deftypefun int starpu_driver_run_once(struct starpu_driver *@var{d})
			
 
				+Runs the driver for a while, then returns 0 on success, -EINVAL if d->type is
			
 
				+not STARPU_CUDA_WORKER.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@deftypefun int starpu_driver_deinit(struct starpu_driver *@var{d})
			
 
				+Deinitialize the given driver. Returns 0 on success, -EINVAL if d->type is not
			
 
				+STARPU_CUDA_WORKER.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@node Example
			
 
				+@subsection Example
			
 
				+
			
 
				+@cartouche
			
 
				+@smallexample
			
 
				+int ret;
			
 
				+struct starpu_driver d = @{
			
 
				+    .type = STARPU_CUDA_WORKER,
			
 
				+    .id.cuda_id = 0
			
 
				+@};
			
 
				+ret = starpu_driver_init(&d);
			
 
				+if (ret != 0)
			
 
				+    error();
			
 
				+while (some_condition) @{
			
 
				+    ret = starpu_driver_run_once(&d);
			
 
				+    if (ret != 0)
			
 
				+        error();
			
 
				+@}
			
 
				+ret = starpu_driver_deinit(&d);
			
 
				+if (ret != 0)
			
 
				+    error();
			
 
				+@end smallexample
			
 
				+@end cartouche
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 @node Expert mode
			
 
				 @section Expert mode
			
 
				 
			
--- a/doc/chapters/advanced-examples.texi
+++ b/doc/chapters/advanced-examples.texi
@@ -903,6 +903,15 @@ some nodes of the hierarchy have a big arity (e.g. many cores in a socket
 
				 without a hierarchy of shared caches), StarPU will create combined workers of
			
 
				 intermediate sizes. The @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} variable
			
 
				 permits to tune the maximum arity between levels of combined workers.
			
 
				+The user can give some hints to StarPU about combined workers sizes to favor.
			
 
				+This can be done by using the environment variables @code{STARPU_MIN_WORKERSIZE}
			
 
				+and @code{STARPU_MAX_WORKERSIZE}. When set, they will force StarPU to create the
			
 
				+biggest combined workers possible without overstepping the defined boundaries.
			
 
				+However, StarPU will create the remaining combined workers without abiding by
			
 
				+the rules if not possible.
			
 
				+For example, if the user specifies a minimum and maximum combined workers size
			
 
				+of 3 on a machine containing 8 CPUs, StarPU will create a combined worker of
			
 
				+size 2 beside the combined workers of size 3.
			
 
				 
			
 
				 The combined workers actually produced can be seen in the output of the
			
 
				 @code{starpu_machine_display} tool (the @code{STARPU_SCHED} environment variable
			
@@ -1041,6 +1050,7 @@ renderbuffer objects into CUDA.  CUDA however imposes some technical
 
				 constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
			
 
				 to be the one that runs CUDA computations for that GPU.
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
			
 
				 to @code{./configure} (TODO: make it dynamic), OpenGL/GLUT has to be initialized
			
 
				 first, and the interoperability mode has to
			
@@ -1057,6 +1067,21 @@ to make GLUT progress from the StarPU driver loop, while the latter uses
 
				 Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
			
 
				 the CUDA pointer at registration, for instance:
			
 
				 
			
 
				+=======
			
 
				+To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
			
 
				+to @code{./configure} (TODO: make it dynamic), the interoperability mode has to
			
 
				+be enabled by using the @code{cuda_opengl_interoperability} field of the
			
 
				+@code{starpu_conf} structure, and the driver loop has to be run by
			
 
				+the application, by using the @code{not_launched_drivers} field of
			
 
				+@code{starpu_conf} to prevent StarPU from running it in a separate thread, and
			
 
				+by using @code{starpu_driver_run} to run the loop. The @code{gl_interop} example
			
 
				+shows how it articulates in a simple case, where rendering is done in task
			
 
				+callbacks. TODO: provide glutIdleFunc alternative.
			
 
				+
			
 
				+Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
			
 
				+the CUDA pointer at registration, for instance:
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 @cartouche
			
 
				 @smallexample
			
 
				 /* Get the CUDA worker id */
			
@@ -1064,7 +1089,10 @@ for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
 
				         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
			
 
				                 break;
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 /* Build a CUDA pointer pointing at the OpenGL buffer */
			
 
				+=======
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
			
 
				 
			
 
				 /* And register it to StarPU */
			
@@ -1073,11 +1101,14 @@ starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
 
				 
			
 
				 /* The handle can now be used as usual */
			
 
				 starpu_insert_task(&cl, STARPU_RW, handle, 0);
			
 
				+<<<<<<< .working
			
 
				 
			
 
				 /* ... */
			
 
				 
			
 
				 /* This gets back data into the OpenGL buffer */
			
 
				 starpu_data_unregister(handle);
			
 
				+=======
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 @end smallexample
			
 
				 @end cartouche
			
 
				 
			
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -36,6 +36,7 @@ Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
 
				 indicates that no worker was available (so that StarPU was not initialized).
			
 
				 @end deftypefun
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 @deftp {Data Type} {struct starpu_driver}
			
 
				 @table @asis
			
 
				 @item @code{enum starpu_archtype type}
			
@@ -54,6 +55,24 @@ Should only be used if type is STARPU_OPENCL_WORKER.
 
				 @end deftp
			
 
				 
			
 
				 
			
 
				+=======
			
 
				+@deftp {Data Type} {struct starpu_driver}
			
 
				+@table @asis
			
 
				+@item @code{enum starpu_archtype type}
			
 
				+The type of the driver. Only STARPU_CUDA_DRIVER and STARPU_OPENCL_DRIVER are
			
 
				+currently supported.
			
 
				+@item @code{union id} Anonymous union
			
 
				+@table @asis
			
 
				+@item @code{unsigned cuda_id}
			
 
				+Should only be used if type is STARPU_CUDA_WORKER.
			
 
				+@item @code{cl_device_id opencl_id}
			
 
				+Should only be used if type is STARPU_OPENCL_WORKER.
			
 
				+@end table
			
 
				+@end table
			
 
				+@end deftp
			
 
				+
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 @deftp {Data Type} {struct starpu_conf}
			
 
				 This structure is passed to the @code{starpu_init} function in order
			
 
				 to configure StarPU. It has to be initialized with @code{starpu_conf_init}.
			
@@ -151,6 +170,7 @@ CPUs and all accelerators. This can also be specified with the
 
				 The AMD implementation of OpenCL is known to
			
 
				 fail when copying data asynchronously. When using this implementation,
			
 
				 it is therefore necessary to disable asynchronous data transfers.
			
 
				+<<<<<<< .working
			
 
				 This can also be specified at compilation time by giving to the
			
 
				 configure script the option @code{--disable-asynchronous-copy}.
			
 
				 
			
@@ -186,6 +206,24 @@ The drivers that should not be launched by StarPU.
 
				 @item @code{unsigned nnot_launched_drivers}
			
 
				 The number of StarPU drivers that should not be launched by StarPU.
			
 
				 
			
 
				+=======
			
 
				+
			
 
				+@item @code{int *cuda_opengl_interoperability} (default = NULL)
			
 
				+This can be set to an array of CUDA device identifiers for which
			
 
				+@code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
			
 
				+size is specified by the @code{n_cuda_opengl_interoperability} field below.
			
 
				+
			
 
				+@item @code{int *n_cuda_opengl_interoperability} (default = 0)
			
 
				+This has to be set to the size of the array pointed to by the
			
 
				+@code{cuda_opengl_interoperability} field.
			
 
				+
			
 
				+@item @code{struct starpu_driver *not_launched_drivers}
			
 
				+The drivers that should not be launched by StarPU.
			
 
				+
			
 
				+@item @code{unsigned nnot_launched_drivers}
			
 
				+The number of StarPU drivers that should not be launched by StarPU.
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 @end table
			
 
				 @end deftp
			
 
				 
			
--- a/doc/chapters/configuration.texi
+++ b/doc/chapters/configuration.texi
@@ -274,7 +274,23 @@ By default, it is disabled.
 
				 @node Workers
			
 
				 @subsection Configuring workers
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 @table @code
			
 
				+=======
			
 
				+@menu
			
 
				+* STARPU_NCPUS::                	Number of CPU workers
			
 
				+* STARPU_NCUDA::                	Number of CUDA workers
			
 
				+* STARPU_NOPENCL::              	Number of OpenCL workers
			
 
				+* STARPU_NGORDON::              	Number of SPU workers (Cell)
			
 
				+* STARPU_WORKERS_NOBIND::       	Do not bind workers
			
 
				+* STARPU_WORKERS_CPUID::        	Bind workers to specific CPUs
			
 
				+* STARPU_WORKERS_CUDAID::       	Select specific CUDA devices
			
 
				+* STARPU_WORKERS_OPENCLID::     	Select specific OpenCL devices
			
 
				+* STARPU_SINGLE_COMBINED_WORKER:: 	Do not use concurrent workers
			
 
				+* STARPU_MIN_WORKERSIZE::	 	Minimum size of the combined workers
			
 
				+* STARPU_MAX_WORKERSIZE:: 		Maximum size of the combined workers
			
 
				+@end menu
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 @item @code{STARPU_NCPU}
			
 
				 Specify the number of CPU workers (thus not including workers dedicated to control acceleratores). Note that by default, StarPU will not allocate
			
@@ -339,6 +355,7 @@ OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
 
				 This variable is ignored if the @code{use_explicit_workers_opencl_gpuid} flag of
			
 
				 the @code{starpu_conf} structure passed to @code{starpu_init} is set.
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 @item @code{STARPU_SINGLE_COMBINED_WORKER}
			
 
				 If set, StarPU will create several workers which won't be able to work
			
 
				 concurrently. It will create combined workers which size goes from 1 to the
			
@@ -382,6 +399,27 @@ instead. This permits to test the performance effect of GPU-Direct.
 
				 
			
 
				 @end table
			
 
				 
			
 
				+=======
			
 
				+@node STARPU_SINGLE_COMBINED_WORKER
			
 
				+@subsubsection @code{STARPU_SINGLE_COMBINED_WORKER} -- Do not use concurrent workers
			
 
				+
			
 
				+If set, StarPU will create several workers which won't be able to work
			
 
				+concurrently. It will create combined workers which size goes from 1 to the
			
 
				+total number of CPU workers in the system.
			
 
				+
			
 
				+@node STARPU_MIN_WORKERSIZE
			
 
				+@subsubsection @code{STARPU_MIN_WORKERSIZE} -- Minimum size of the combined workers
			
 
				+
			
 
				+Let the user give a hint to StarPU about how many workers
			
 
				+(minimum boundary) the combined workers should contain.
			
 
				+
			
 
				+@node STARPU_MAX_WORKERSIZE
			
 
				+@subsubsection @code{STARPU_MAX_WORKERSIZE} -- Maximum size of the combined workers
			
 
				+
			
 
				+Let the user give a hint to StarPU about how many workers
			
 
				+(maximum boundary) the combined workers should contain.
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 @node Scheduling
			
 
				 @subsection Configuring the Scheduling engine
			
 
				 
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -852,6 +852,7 @@ pi_pi_redux_LDADD =				\
 
				 	$(STARPU_CURAND_LDFLAGS)
			
 
				 endif
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 ###########################
			
 
				 # OpenGL interoperability #
			
 
				 ###########################
			
@@ -887,6 +888,23 @@ pipeline_pipeline_LDADD =		\
 
				 	$(STARPU_BLAS_LDFLAGS)
			
 
				 endif
			
 
				 
			
 
				+=======
			
 
				+###########################
			
 
				+# OpenGL interoperability #
			
 
				+###########################
			
 
				+
			
 
				+if HAVE_OPENGL
			
 
				+examplebin_PROGRAMS +=				\
			
 
				+	gl_interop/gl_interop
			
 
				+
			
 
				+gl_interop_gl_interop_SOURCES =			\
			
 
				+	gl_interop/gl_interop.c
			
 
				+
			
 
				+gl_interop_gl_interop_LDADD =			\
			
 
				+	$(STARPU_OPENGL_RENDER_LDFLAGS)
			
 
				+endif
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 showcheck:
			
 
				 	-cat $(TEST_LOGS) /dev/null
			
 
				 	for i in $(SUBDIRS) ; do \
			
--- a/examples/interface/complex_interface.c
+++ b/examples/interface/complex_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_cuda.h>
			
 
				+#include <starpu_opencl.h>
			
 
				+#include <starpu_hash.h>
			
 
				 
			
 
				 #include "complex_interface.h"
			
 
				 
			
@@ -134,7 +137,7 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, uint32
 
				 		}
			
 
				 #endif
			
 
				 		default:
			
 
				-			STARPU_ABORT();
			
 
				+			STARPU_ASSERT(0);
			
 
				 	}
			
 
				 
			
 
				 	if (fail)
			
@@ -161,43 +164,6 @@ static uint32_t complex_footprint(starpu_data_handle_t handle)
 
				 	return starpu_crc32_be(starpu_complex_get_nx(handle), 0);
			
 
				 }
			
 
				 
			
 
				-static void *complex_handle_to_pointer(starpu_data_handle_t handle, uint32_t node)
			
 
				-{
			
 
				-	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				-
			
 
				-	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
			
 
				-		starpu_data_get_interface_on_node(handle, node);
			
 
				-
			
 
				-	return (void*) complex_interface->real;
			
 
				-}
			
 
				-
			
 
				-static int complex_pack_data(starpu_data_handle_t handle, uint32_t node, void **ptr)
			
 
				-{
			
 
				-	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				-
			
 
				-	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
			
 
				-		starpu_data_get_interface_on_node(handle, node);
			
 
				-
			
 
				-	*ptr = malloc(complex_get_size(handle));
			
 
				-	memcpy(*ptr, complex_interface->real, complex_interface->nx*sizeof(double));
			
 
				-	memcpy(*ptr+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int complex_unpack_data(starpu_data_handle_t handle, uint32_t node, void *ptr)
			
 
				-{
			
 
				-	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
			
 
				-
			
 
				-	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
			
 
				-		starpu_data_get_interface_on_node(handle, node);
			
 
				-
			
 
				-	memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
			
 
				-	memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, enum cudaMemcpyKind kind, cudaStream_t stream)
			
 
				 {
			
@@ -238,10 +204,11 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				+static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
			
 
				 {
			
 
				 	struct starpu_complex_interface *src_complex = src_interface;
			
 
				 	struct starpu_complex_interface *dst_complex = dst_interface;
			
 
				+	cl_event *event = (cl_event *)_event;
			
 
				 	cl_int err;
			
 
				 	int ret;
			
 
				 
			
@@ -277,10 +244,11 @@ static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_
 
				         return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				 }
			
 
				 
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				+static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *_event)
			
 
				 {
			
 
				 	struct starpu_complex_interface *src_complex = src_interface;
			
 
				 	struct starpu_complex_interface *dst_complex = dst_interface;
			
 
				+	cl_event *event = (cl_event *)_event;
			
 
				 	cl_int err;
			
 
				 	int ret;
			
 
				 
			
@@ -342,9 +310,6 @@ static struct starpu_data_interface_ops interface_complex_ops =
 
				 	.footprint = complex_footprint,
			
 
				 	.interfaceid = -1,
			
 
				 	.interface_size = sizeof(struct starpu_complex_interface),
			
 
				-	.handle_to_pointer = complex_handle_to_pointer,
			
 
				-	.pack_data = complex_pack_data,
			
 
				-	.unpack_data = complex_unpack_data
			
 
				 };
			
 
				 
			
 
				 void starpu_complex_data_register(starpu_data_handle_t *handleptr, uint32_t home_node, double *real, double *imaginary, int nx)
			
--- a/gcc-plugin/src/Makefile.am
+++ b/gcc-plugin/src/Makefile.am
@@ -17,36 +17,21 @@
 
				 # requires a name prefixed by `lib'.
			
 
				 gccplugin_LTLIBRARIES = starpu.la
			
 
				 
			
 
				-starpu_la_SOURCES =				\
			
 
				-  c-expr.y					\
			
 
				-  opencl.c					\
			
 
				-  starpu.c					\
			
 
				-  tasks.c					\
			
 
				-  utils.c
			
 
				-
			
 
				-if HAVE_PTR_DEREFS_MAY_ALIAS_P
			
 
				-
			
 
				-# Only for GCC >= 4.6.
			
 
				-starpu_la_SOURCES += warn-unregistered.c
			
 
				-
			
 
				-endif
			
 
				+starpu_la_SOURCES = starpu.c c-expr.y
			
 
				 
			
 
				 # Use the Yacc-compatibility mode so that Bison doesn't error out upon
			
 
				 # reduce/reduce conflicts.
			
 
				 AM_YFLAGS = -y
			
 
				 
			
 
				 AM_CPPFLAGS =						\
			
 
				-  -I$(top_builddir)/gcc-plugin/include			\
			
 
				-  -I$(top_srcdir)/gcc-plugin/include			\
			
 
				   -I$(top_srcdir)/include				\
			
 
				-  -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1	\
			
 
				+  -I$(GCC_PLUGIN_INCLUDE_DIR) -Wall -DYYERROR_VERBOSE=1 \
			
 
				   $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
			
 
				 
			
 
				-AM_LDFLAGS = -module
			
 
				+AM_LDFLAGS = -module --tag="$(GCC_FOR_PLUGIN_LIBTOOL_TAG)"
			
 
				 
			
 
				 # Use either `gcc' or `g++', whichever is appropriate to build
			
 
				 # plug-ins for this version of GCC.
			
 
				-AM_LIBTOOLFLAGS = --tag="$(GCC_FOR_PLUGIN_LIBTOOL_TAG)"
			
 
				 CC = $(GCC_FOR_PLUGIN)
			
 
				 
			
 
				 showcheck:
			
--- a/gcc-plugin/src/starpu.c
+++ b/gcc-plugin/src/starpu.c
--- a/gcc-plugin/tests/Makefile.am
+++ b/gcc-plugin/tests/Makefile.am
@@ -21,8 +21,6 @@ gcc_tests =					\
 
				   output-pointer-errors.c			\
			
 
				   register.c					\
			
 
				   register-errors.c				\
			
 
				-  registered.c					\
			
 
				-  registered-errors.c				\
			
 
				   acquire.c					\
			
 
				   acquire-errors.c				\
			
 
				   release.c					\
			
@@ -46,16 +44,6 @@ gcc_tests =					\
 
				 
			
 
				 EXTRA_DIST =
			
 
				 
			
 
				-if HAVE_PTR_DEREFS_MAY_ALIAS_P
			
 
				-
			
 
				-gcc_tests += warn-unregistered.c
			
 
				-
			
 
				-else !HAVE_PTR_DEREFS_MAY_ALIAS_P
			
 
				-
			
 
				-EXTRA_DIST += warn-unregistered.c
			
 
				-
			
 
				-endif !HAVE_PTR_DEREFS_MAY_ALIAS_P
			
 
				-
			
 
				 if !STARPU_USE_OPENCL
			
 
				 
			
 
				 # XXX: This test simulates a buggy OpenCL implementation, and thus
			
@@ -80,7 +68,6 @@ CLEANFILES = *.gimple *.o			\
 
				   base						\
			
 
				   pointers					\
			
 
				   register					\
			
 
				-  registered					\
			
 
				   release					\
			
 
				   scalar-tasks					\
			
 
				   pointer-tasks					\
			
@@ -98,21 +85,20 @@ EXTRA_DIST += ./run-test.in			\
 
				   $(gcc_tests)
			
 
				 
			
 
				 # The test suite assumes that the CPU back-end is available.
			
 
				-if RUN_GCC_PLUGIN_TESTS
			
 
				+if STARPU_USE_CPU
			
 
				 
			
 
				 TESTS = $(gcc_tests)
			
 
				-if STARPU_HAVE_AM111
			
 
				-LOG_COMPILER = ./run-test
			
 
				-else
			
 
				-TESTS_ENVIRONMENT = ./run-test
			
 
				+
			
 
				 endif
			
 
				 
			
 
				-else !RUN_GCC_PLUGIN_TESTS
			
 
				+TESTS_ENVIRONMENT = ./run-test
			
 
				+
			
 
				+if !HAVE_GUILE
			
 
				 
			
 
				 check-hook:
			
 
				-	-@echo "GNU Guile or CPU back-end not available, test suite not run."
			
 
				+	-@echo "GNU Guile not available, test suite not run."
			
 
				 
			
 
				-endif !RUN_GCC_PLUGIN_TESTS
			
 
				+endif !HAVE_GUILE
			
 
				 
			
 
				 showcheck:
			
 
				 	-cat $(TEST_LOGS) /dev/null
			
--- a/gcc-plugin/tests/mocks.h
+++ b/gcc-plugin/tests/mocks.h
@@ -296,15 +296,7 @@ starpu_vector_data_register (starpu_data_handle_t *handle,
 
				 			     uint32_t home_node, uintptr_t ptr,
			
 
				 			     uint32_t count, size_t elemsize)
			
 
				 {
			
 
				-  /* Sometimes tests cannot tell what the pointer will be (for instance, for
			
 
				-     the `registered' attribute), and thus pass NULL as the expected
			
 
				-     pointer.  */
			
 
				-  if (expected_register_arguments.pointer != NULL)
			
 
				-    assert ((void *) ptr == expected_register_arguments.pointer);
			
 
				-  else
			
 
				-    /* Allow users to check the pointer afterward.  */
			
 
				-    expected_register_arguments.pointer = (void *) ptr;
			
 
				-
			
 
				+  assert ((void *) ptr == expected_register_arguments.pointer);
			
 
				   assert (count == expected_register_arguments.elements);
			
 
				   assert (elemsize == expected_register_arguments.element_size);
			
 
				 
			
@@ -432,41 +424,15 @@ starpu_free (void *ptr)
 
				 
			
 
				 /* OpenCL support.  */
			
 
				 
			
 
				-#ifndef STARPU_USE_OPENCL
			
 
				+#define STARPU_USE_OPENCL 1
			
 
				 
			
 
				-# define STARPU_USE_OPENCL 1
			
 
				-
			
 
				-/* The `opencl' pragma needs this structure, so make sure it's defined.  */
			
 
				 struct starpu_opencl_program
			
 
				 {
			
 
				   /* Nothing.  */
			
 
				 };
			
 
				 
			
 
				-typedef int cl_event;
			
 
				-typedef int cl_kernel;
			
 
				-typedef int cl_command_queue;
			
 
				-
			
 
				-extern cl_int clSetKernelArg (cl_kernel, cl_uint, size_t, const void *);
			
 
				-
			
 
				-extern cl_int
			
 
				-clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
			
 
				-                       cl_kernel        /* kernel */,
			
 
				-                       cl_uint          /* work_dim */,
			
 
				-                       const size_t *   /* global_work_offset */,
			
 
				-                       const size_t *   /* global_work_size */,
			
 
				-                       const size_t *   /* local_work_size */,
			
 
				-                       cl_uint          /* num_events_in_wait_list */,
			
 
				-                       const cl_event * /* event_wait_list */,
			
 
				-                       cl_event *       /* event */);
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				-
			
 
				-/* Number of `load_opencl_from_string', `load_kernel', and `clSetKernelArg'
			
 
				-   calls.  */
			
 
				-static unsigned int load_opencl_calls, load_opencl_kernel_calls,
			
 
				-  opencl_set_kernel_arg_calls, opencl_enqueue_calls, opencl_finish_calls,
			
 
				-  opencl_collect_stats_calls, opencl_release_event_calls;
			
 
				+/* Number of `load_opencl_from_string' calls.  */
			
 
				+static unsigned int load_opencl_calls;
			
 
				 
			
 
				 struct load_opencl_arguments
			
 
				 {
			
@@ -477,15 +443,6 @@ struct load_opencl_arguments
 
				 /* Expected arguments.  */
			
 
				 static struct load_opencl_arguments expected_load_opencl_arguments;
			
 
				 
			
 
				-struct cl_enqueue_kernel_arguments
			
 
				-{
			
 
				-  size_t * global_work_size;
			
 
				-};
			
 
				-
			
 
				-/* Variable describing the expected `clEnqueueNDRangeKernel' arguments. */
			
 
				-static struct cl_enqueue_kernel_arguments expected_cl_enqueue_kernel_arguments;
			
 
				-
			
 
				-
			
 
				 int
			
 
				 starpu_opencl_load_opencl_from_string (const char *source,
			
 
				 				       struct starpu_opencl_program *program,
			
@@ -497,112 +454,6 @@ starpu_opencl_load_opencl_from_string (const char *source,
 
				   return 0;
			
 
				 }
			
 
				 
			
 
				-int
			
 
				-starpu_opencl_load_kernel (cl_kernel *kernel,
			
 
				-			   cl_command_queue *queue,
			
 
				-			   struct starpu_opencl_program *programs,
			
 
				-			   const char *kernel_name, int devid)
			
 
				-{
			
 
				-  assert (kernel != NULL && queue != NULL && programs != NULL
			
 
				-	  && kernel_name != NULL && devid == -42);
			
 
				-  load_opencl_kernel_calls++;
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-int
			
 
				-starpu_worker_get_id (void)
			
 
				-{
			
 
				-  return 42;
			
 
				-}
			
 
				-
			
 
				-int
			
 
				-starpu_worker_get_devid (int id)
			
 
				-{
			
 
				-  return -id;
			
 
				-}
			
 
				-
			
 
				-/* Set the INDEXth argument to KERNEL to the SIZE bytes pointed to by
			
 
				-   VALUE.  */
			
 
				-cl_int
			
 
				-clSetKernelArg (cl_kernel kernel, cl_uint index, size_t size,
			
 
				-		const void *value)
			
 
				-{
			
 
				-  size_t n;
			
 
				-  const struct insert_task_argument *arg;
			
 
				-
			
 
				-  for (n = 0, arg = expected_insert_task_arguments;
			
 
				-       n < index;
			
 
				-       n++, arg++)
			
 
				-    assert (arg->pointer != NULL);
			
 
				-
			
 
				-  switch (arg->type)
			
 
				-    {
			
 
				-    case STARPU_VALUE:
			
 
				-      assert (size == arg->size);
			
 
				-      assert (memcmp (arg->pointer, value, size) == 0);
			
 
				-      break;
			
 
				-
			
 
				-    case STARPU_RW:
			
 
				-    case STARPU_R:
			
 
				-    case STARPU_W:
			
 
				-      assert (size == sizeof (void *));
			
 
				-      assert (* (void **) value == arg->pointer);
			
 
				-      break;
			
 
				-
			
 
				-    default:
			
 
				-      abort ();
			
 
				-    }
			
 
				-
			
 
				-  opencl_set_kernel_arg_calls++;
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-cl_int
			
 
				-clEnqueueNDRangeKernel(cl_command_queue command_queue,
			
 
				-                       cl_kernel        kernel,
			
 
				-                       cl_uint          work_dim,
			
 
				-                       const size_t *   global_work_offset,
			
 
				-                       const size_t *   global_work_size,
			
 
				-                       const size_t *   local_work_size,
			
 
				-                       cl_uint          num_events_in_wait_list,
			
 
				-                       const cl_event * event_wait_list,
			
 
				-                       cl_event *       event)
			
 
				-{
			
 
				-  assert (*local_work_size == 1);
			
 
				-  assert (*global_work_size == *expected_cl_enqueue_kernel_arguments.global_work_size);
			
 
				-
			
 
				-  opencl_enqueue_calls++;
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-cl_int
			
 
				-clFinish (cl_command_queue command_queue)
			
 
				-{
			
 
				-  opencl_finish_calls++;
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-cl_int
			
 
				-starpu_opencl_collect_stats (cl_event event)
			
 
				-{
			
 
				-  opencl_collect_stats_calls++;
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-cl_int
			
 
				-clReleaseEvent (cl_event event)
			
 
				-{
			
 
				-  opencl_release_event_calls++;
			
 
				-  return 0;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-const char *
			
 
				-starpu_opencl_error_string (cl_int s)
			
 
				-{
			
 
				-  return "mock";
			
 
				-}
			
 
				-
			
 
				 
			
 
				 /* Initialization.  */
			
 
				 
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -68,6 +68,7 @@ extern "C"
 
				 
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 enum starpu_archtype
			
 
				 {
			
 
				 	STARPU_CPU_WORKER,    /* CPU core */
			
@@ -138,6 +139,42 @@ struct starpu_driver
 
				 };
			
 
				 
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
 
				+#include <starpu_opencl.h>
			
 
				+#endif
			
 
				+
			
 
				+enum starpu_archtype
			
 
				+{
			
 
				+	STARPU_CPU_WORKER,    /* CPU core */
			
 
				+	STARPU_CUDA_WORKER,   /* NVIDIA CUDA device */
			
 
				+	STARPU_OPENCL_WORKER, /* OpenCL device */
			
 
				+	STARPU_GORDON_WORKER  /* Cell SPU */
			
 
				+};
			
 
				+
			
 
				+struct starpu_driver
			
 
				+{
			
 
				+	enum starpu_archtype type;
			
 
				+	union
			
 
				+	{
			
 
				+		unsigned cuda_id;
			
 
				+#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
 
				+		cl_device_id opencl_id;
			
 
				+#endif
			
 
				+		/*
			
 
				+		 * TODO: handle CPUs:
			
 
				+		 * 1) Add a member to this union.
			
 
				+		 * 2) Edit _starpu_launch_drivers() to make sure the driver is
			
 
				+		 *    not always launched.
			
 
				+		 * 3) Edit starpu_driver_run() so that it can handle another
			
 
				+		 *    kind of architecture.
			
 
				+		 * 4) Write _starpu_run_foobar() in the corresponding driver.
			
 
				+		 * 5) Test the whole thing :)
			
 
				+		 */
			
 
				+	} id;
			
 
				+};
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 struct starpu_conf
			
 
				 {
			
 
				 	/* Will be initialized by starpu_conf_init */
			
@@ -177,6 +214,7 @@ struct starpu_conf
 
				         /* indicate if all asynchronous copies should be disabled */
			
 
				 	int disable_asynchronous_copy;
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 
			
 
				         /* indicate if asynchronous copies to CUDA devices should be disabled */
			
 
				 	int disable_cuda_asynchronous_copy;
			
@@ -201,6 +239,16 @@ struct starpu_conf
 
				 	struct starpu_driver *not_launched_drivers;
			
 
				 	unsigned n_not_launched_drivers;
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+
			
 
				+	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
			
 
				+	int *cuda_opengl_interoperability;
			
 
				+	unsigned n_cuda_opengl_interoperability;
			
 
				+
			
 
				+	/* A driver that the application will run in one of its own threads. */
			
 
				+	struct starpu_driver *not_launched_drivers;
			
 
				+	unsigned n_not_launched_drivers;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 };
			
 
				 
			
 
				 /* Initialize a starpu_conf structure with default values. */
			
@@ -245,6 +293,7 @@ int starpu_combined_worker_get_rank(void);
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 enum starpu_archtype
			
 
				 {
			
 
				 	STARPU_CPU_WORKER, /* CPU core */
			
@@ -259,6 +308,8 @@ enum starpu_archtype
 
				 >>>>>>> .merge-right.r6541
			
 
				 =======
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 /* This function returns the type of worker associated to an identifier (as
			
 
				  * returned by the starpu_worker_get_id function). The returned value indicates
			
@@ -305,6 +356,7 @@ void starpu_set_end_of_submissions(void);
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 int starpu_driver_init(struct starpu_driver *d);
			
 
				 int starpu_driver_run_once(struct starpu_driver *d);
			
 
				 int starpu_driver_deinit(struct starpu_driver *d);
			
@@ -332,6 +384,14 @@ int starpu_driver_init(struct starpu_driver *d);
 
				 int starpu_driver_run_once(struct starpu_driver *d);
			
 
				 int starpu_driver_deinit(struct starpu_driver *d);
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+int starpu_driver_run(struct starpu_driver *);
			
 
				+void starpu_set_end_of_submissions(void);
			
 
				+
			
 
				+int starpu_driver_init(struct starpu_driver *d);
			
 
				+int starpu_driver_run_once(struct starpu_driver *d);
			
 
				+int starpu_driver_deinit(struct starpu_driver *d);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/mpi/.gitignore
+++ b/mpi/.gitignore
@@ -1 +0,0 @@
 
				-/.deps
			
--- a/mpi/starpu_mpi_insert_task_cache.c
+++ b/mpi/starpu_mpi_insert_task_cache.c
@@ -1,95 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2011  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu_mpi_private.h>
			
 
				-#include <starpu_mpi_insert_task_cache.h>
			
 
				-#include <starpu_hash.h>
			
 
				-#include <common/htable32.h>
			
 
				-
			
 
				-typedef struct _starpu_mpi_clear_cache_s {
			
 
				-        starpu_data_handle_t data;
			
 
				-        int rank;
			
 
				-        int mode;
			
 
				-} _starpu_mpi_clear_cache_t;
			
 
				-
			
 
				-struct starpu_htbl32_node **sent_data = NULL;
			
 
				-struct starpu_htbl32_node **received_data = NULL;
			
 
				-
			
 
				-void _starpu_mpi_clear_cache_callback(void *callback_arg)
			
 
				-{
			
 
				-        _starpu_mpi_clear_cache_t *clear_cache = (_starpu_mpi_clear_cache_t *)callback_arg;
			
 
				-        uint32_t key = starpu_crc32_be((uintptr_t)clear_cache->data, 0);
			
 
				-
			
 
				-        if (clear_cache->mode == _STARPU_MPI_CLEAR_SENT_DATA) {
			
 
				-                _STARPU_MPI_DEBUG("Clearing sent cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
			
 
				-                _starpu_htbl_insert_32(&sent_data[clear_cache->rank], key, NULL);
			
 
				-        }
			
 
				-        else if (clear_cache->mode == _STARPU_MPI_CLEAR_RECEIVED_DATA) {
			
 
				-                _STARPU_MPI_DEBUG("Clearing received cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
			
 
				-                _starpu_htbl_insert_32(&received_data[clear_cache->rank], key, NULL);
			
 
				-        }
			
 
				-
			
 
				-        free(clear_cache);
			
 
				-}
			
 
				-
			
 
				-double _starpu_mpi_clear_cache_cost_function(struct starpu_task *task, unsigned nimpl)
			
 
				-{
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static struct starpu_perfmodel _starpu_mpi_clear_cache_model =
			
 
				-{
			
 
				-	.cost_function = _starpu_mpi_clear_cache_cost_function,
			
 
				-	.type = STARPU_COMMON,
			
 
				-};
			
 
				-
			
 
				-static void _starpu_mpi_clear_cache_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-static struct starpu_codelet _starpu_mpi_clear_cache_codelet =
			
 
				-{
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				-	.cpu_funcs = {_starpu_mpi_clear_cache_func, NULL},
			
 
				-	.cuda_funcs = {_starpu_mpi_clear_cache_func, NULL},
			
 
				-	.opencl_funcs = {_starpu_mpi_clear_cache_func, NULL},
			
 
				-	.nbuffers = 1,
			
 
				-	.modes = {STARPU_RW},
			
 
				-	.model = &_starpu_mpi_clear_cache_model
			
 
				-	// The model has a cost function which returns 0 so as to allow the codelet to be scheduled anywhere
			
 
				-};
			
 
				-
			
 
				-void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode)
			
 
				-{
			
 
				-        struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-	// We have a codelet with a empty function just to force the
			
 
				-	// task being created to have a dependency on data_handle
			
 
				-        task->cl = &_starpu_mpi_clear_cache_codelet;
			
 
				-        task->handles[0] = data_handle;
			
 
				-
			
 
				-        _starpu_mpi_clear_cache_t *clear_cache = malloc(sizeof(_starpu_mpi_clear_cache_t));
			
 
				-        clear_cache->data = data_handle;
			
 
				-        clear_cache->rank = rank;
			
 
				-        clear_cache->mode = mode;
			
 
				-
			
 
				-        task->callback_func = _starpu_mpi_clear_cache_callback;
			
 
				-        task->callback_arg = clear_cache;
			
 
				-        int ret = starpu_task_submit(task);
			
 
				-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				-}
			
 
				-
			
--- a/mpi/starpu_mpi_insert_task_cache.h
+++ b/mpi/starpu_mpi_insert_task_cache.h
@@ -1,26 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2011  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-
			
 
				-#define _STARPU_MPI_CLEAR_SENT_DATA     0
			
 
				-#define _STARPU_MPI_CLEAR_RECEIVED_DATA 1
			
 
				-
			
 
				-extern struct starpu_htbl32_node **sent_data;
			
 
				-extern struct starpu_htbl32_node **received_data;
			
 
				-
			
 
				-void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode);
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -94,12 +94,17 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	size_t size = SIZE;
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 	/* Initialize CUDA context on the device */
			
 
				 	cudaSetDevice(dev);
			
 
				 =======
			
 
				 	/* Initialize CUDA context on the device */
			
 
				 	starpu_cuda_set_device(dev);
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+	/* Initialize CUDA context on the device */
			
 
				+	starpu_cuda_set_device(dev);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
@@ -194,12 +199,17 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 	/* Initialize CUDA context on the source */
			
 
				 	cudaSetDevice(src);
			
 
				 =======
			
 
				 	/* Initialize CUDA context on the source */
			
 
				 	starpu_cuda_set_device(src);
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+	/* Initialize CUDA context on the source */
			
 
				+	starpu_cuda_set_device(src);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
			
 
				 		cures = cudaDeviceCanAccessPeer(&can, src, dst);
			
@@ -217,12 +227,17 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	cudaMemset(s_buffer, 0, size);
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 	/* Initialize CUDA context on the destination */
			
 
				 	cudaSetDevice(dst);
			
 
				 =======
			
 
				 	/* Initialize CUDA context on the destination */
			
 
				 	starpu_cuda_set_device(dst);
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+	/* Initialize CUDA context on the destination */
			
 
				+	starpu_cuda_set_device(dst);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
			
 
				 		cures = cudaDeviceCanAccessPeer(&can, dst, src);
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -681,6 +681,7 @@ void _starpu_decrement_nsubmitted_tasks(void)
 
				 
			
 
				 }
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 void
			
 
				 starpu_drivers_request_termination(void)
			
 
				 {
			
@@ -697,6 +698,29 @@ starpu_drivers_request_termination(void)
 
				 	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
			
 
				 }
			
 
				 
			
 
				+=======
			
 
				+void
			
 
				+starpu_set_end_of_submissions(void)
			
 
				+{
			
 
				+	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+
			
 
				+	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
			
 
				+
			
 
				+	config->submitting = 0;
			
 
				+	if (nsubmitted == 0) {
			
 
				+		config->running = 0;
			
 
				+		_STARPU_PTHREAD_COND_BROADCAST(&submitted_cond);
			
 
				+	}
			
 
				+
			
 
				+	_STARPU_PTHREAD_MUTEX_UNLOCK(&submitted_mutex);
			
 
				+}
			
 
				+
			
 
				+void _starpu_check_nsubmitted_tasks(void)
			
 
				+{
			
 
				+
			
 
				+}
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 static void _starpu_increment_nsubmitted_tasks(void)
			
 
				 {
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&submitted_mutex);
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -252,6 +252,7 @@ static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 
				 }
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 /*
			
 
				  * Returns 0 if the given driver is one of the drivers that must be launched by
			
 
				  * the application itself, and not by StarPU, 1 otherwise.
			
@@ -367,6 +368,46 @@ static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
 
				 }
			
 
				 
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+/*
			
 
				+ * Returns 0 if the given driver is one of the drivers that must be launched by
			
 
				+ * the application itself, and not by StarPU, 1 otherwise.
			
 
				+ */
			
 
				+static unsigned _starpu_may_launch_driver(struct starpu_conf *conf,
			
 
				+					  struct starpu_driver *d)
			
 
				+{
			
 
				+	if (conf->n_not_launched_drivers == 0 ||
			
 
				+	    conf->not_launched_drivers == NULL)
			
 
				+		return 1;
			
 
				+
			
 
				+	/* Is <d> in conf->not_launched_drivers ? */
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < conf->n_not_launched_drivers; i++)
			
 
				+	{
			
 
				+		if (d->type != conf->not_launched_drivers[i].type)
			
 
				+			continue;
			
 
				+
			
 
				+		switch (d->type)
			
 
				+		{
			
 
				+		case STARPU_CUDA_WORKER:
			
 
				+			if (d->id.cuda_id == conf->not_launched_drivers[i].id.cuda_id)
			
 
				+				return 0;
			
 
				+			break;
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+		case STARPU_OPENCL_WORKER:
			
 
				+			if (d->id.opencl_id == conf->not_launched_drivers[i].id.opencl_id)
			
 
				+				return 0;
			
 
				+			break;
			
 
				+#endif
			
 
				+		default:
			
 
				+			STARPU_ABORT();
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 static void _starpu_launch_drivers(struct _starpu_machine_config *config)
			
 
				 {
			
 
				 	config->running = 1;
			
@@ -378,10 +419,14 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 
			
 
				 	/* Launch workers asynchronously (except for SPUs) */
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 	unsigned cpu = 0, cuda = 0;
			
 
				 =======
			
 
				 	unsigned cuda = 0;
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+	unsigned cuda = 0;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	unsigned worker;
			
 
				 
			
 
				 #ifdef STARPU_PERF_DEBUG
			
@@ -459,6 +504,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 				workerarg->set = NULL;
			
 
				 				workerarg->worker_is_initialized = 0;
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 				driver.id.cuda_id = cuda;
			
 
				 				if (_starpu_may_launch_driver(config->conf, &driver))
			
 
				 				{
			
@@ -482,11 +528,21 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 				}
			
 
				 				cuda++;
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+				driver.id.cuda_id = cuda;
			
 
				+				if (_starpu_may_launch_driver(config->conf, &driver))
			
 
				+				{
			
 
				+					pthread_create(&workerarg->worker_thread,
			
 
				+						       NULL, _starpu_cuda_worker, workerarg);
			
 
				+				}
			
 
				+				cuda++;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 				break;
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 			case STARPU_OPENCL_WORKER:
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
			
 
				 				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				 				{
			
@@ -498,6 +554,11 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				 					break;
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
			
 
				+				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				+					break;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 				workerarg->set = NULL;
			
 
				 				workerarg->worker_is_initialized = 0;
			
 
				 				_STARPU_PTHREAD_CREATE(
			
@@ -545,11 +606,15 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 	}
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 	cpu  = 0;
			
 
				 	cuda = 0;
			
 
				 =======
			
 
				 	cuda = 0;
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+	cuda = 0;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				 		struct _starpu_worker *workerarg = &config->workers[worker];
			
@@ -560,6 +625,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 		{
			
 
				 			case STARPU_CPU_WORKER:
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 				driver.id.cpu_id = cpu;
			
 
				 				if (!_starpu_may_launch_driver(config->conf, &driver))
			
 
				 				{
			
@@ -579,6 +645,13 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *config)
 
				 				_STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
			
 
				 				break;
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+				_STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
			
 
				+				while (!workerarg->worker_is_initialized)
			
 
				+					_STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
			
 
				+				_STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
			
 
				+				break;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 			case STARPU_CUDA_WORKER:
			
 
				 				driver.id.cuda_id = cuda;
			
 
				 				if (!_starpu_may_launch_driver(config->conf, &driver))
			
@@ -779,6 +852,7 @@ int starpu_init(struct starpu_conf *user_conf)
 
				 
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	_starpu_start_fxt_profiling();
			
 
				 #endif
			
@@ -797,6 +871,8 @@ int starpu_init(struct starpu_conf *user_conf)
 
				 >>>>>>> .merge-right.r6541
			
 
				 =======
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	/* store the pointer to the user explicit configuration during the
			
 
				 	 * initialization */
			
 
				 	if (user_conf == NULL)
			
@@ -820,6 +896,7 @@ int starpu_init(struct starpu_conf *user_conf)
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 	_starpu_init_all_sched_ctxs(&config);
			
 
				 =======
			
 
				 =======
			
@@ -859,6 +936,22 @@ int starpu_init(struct starpu_conf *user_conf)
 
				 	_starpu_load_bus_performance_files();
			
 
				 
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+	_starpu_start_fxt_profiling();
			
 
				+#endif
			
 
				+
			
 
				+	_starpu_open_debug_logfile();
			
 
				+
			
 
				+	_starpu_data_interface_init();
			
 
				+
			
 
				+	_starpu_timing_init();
			
 
				+
			
 
				+	_starpu_profiling_init();
			
 
				+
			
 
				+	_starpu_load_bus_performance_files();
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	ret = _starpu_build_topology(&config);
			
 
				 	if (ret)
			
 
				 	{
			
@@ -1317,6 +1410,7 @@ int starpu_worker_get_nids_by_type(enum starpu_archtype type, int *workerids, in
 
				 }
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 
			
 
				 <<<<<<< .working
			
 
				 
			
@@ -1692,3 +1786,98 @@ starpu_driver_deinit(struct starpu_driver *d)
 
				 	}
			
 
				 }
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+extern int _starpu_run_cuda(struct starpu_driver *);
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+extern int _starpu_run_opencl(struct starpu_driver *);
			
 
				+#endif
			
 
				+
			
 
				+int
			
 
				+starpu_driver_run(struct starpu_driver *d)
			
 
				+{
			
 
				+	if (!d)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	switch (d->type)
			
 
				+	{
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	case STARPU_CUDA_WORKER:
			
 
				+		return _starpu_run_cuda(d);
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	case STARPU_OPENCL_WORKER:
			
 
				+		return _starpu_run_opencl(d);
			
 
				+#endif
			
 
				+	case STARPU_CPU_WORKER:    /* Not supported yet */
			
 
				+	case STARPU_GORDON_WORKER: /* Not supported yet */
			
 
				+	default:
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+extern int _starpu_cuda_driver_init(struct starpu_driver *);
			
 
				+extern int _starpu_cuda_driver_run_once(struct starpu_driver *);
			
 
				+extern int _starpu_cuda_driver_deinit(struct starpu_driver *);
			
 
				+#endif
			
 
				+
			
 
				+int
			
 
				+starpu_driver_init(struct starpu_driver *d)
			
 
				+{
			
 
				+	STARPU_ASSERT(d);
			
 
				+
			
 
				+	switch (d->type)
			
 
				+	{
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	case STARPU_CUDA_WORKER:
			
 
				+		return _starpu_cuda_driver_init(d);
			
 
				+#endif
			
 
				+	case STARPU_CPU_WORKER:    /* Not supported yet */
			
 
				+	case STARPU_OPENCL_WORKER: /* Not supported yet */
			
 
				+	case STARPU_GORDON_WORKER: /* Not supported yet */
			
 
				+	default:
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+starpu_driver_run_once(struct starpu_driver *d)
			
 
				+{
			
 
				+	STARPU_ASSERT(d);
			
 
				+
			
 
				+	switch (d->type)
			
 
				+	{
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	case STARPU_CUDA_WORKER:
			
 
				+		return _starpu_cuda_driver_run_once(d);
			
 
				+#endif
			
 
				+	case STARPU_CPU_WORKER:    /* Not supported yet */
			
 
				+	case STARPU_OPENCL_WORKER: /* Not supported yet */
			
 
				+	case STARPU_GORDON_WORKER: /* Not supported yet */
			
 
				+	default:
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int
			
 
				+starpu_driver_deinit(struct starpu_driver *d)
			
 
				+{
			
 
				+	STARPU_ASSERT(d);
			
 
				+
			
 
				+	switch (d->type)
			
 
				+	{
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	case STARPU_CUDA_WORKER:
			
 
				+		return _starpu_cuda_driver_deinit(d);
			
 
				+#endif
			
 
				+	case STARPU_CPU_WORKER:    /* Not supported yet */
			
 
				+	case STARPU_OPENCL_WORKER: /* Not supported yet */
			
 
				+	case STARPU_GORDON_WORKER: /* Not supported yet */
			
 
				+	default:
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+}
			
 
				+>>>>>>> .merge-right.r6541
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -71,6 +71,7 @@ struct _starpu_worker
 
				         pthread_cond_t ready_cond; /* indicate when the worker is ready */
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 	unsigned memory_node; /* which memory node is the worker associated with ? */
			
 
				 	pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
			
 
				 	pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
			
@@ -79,6 +80,9 @@ struct _starpu_worker
 
				 =======
			
 
				 	unsigned memory_node; /* which memory node is the worker associated with ? */
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+	unsigned memory_node; /* which memory node is the worker associated with ? */
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	pthread_cond_t *sched_cond; /* condition variable used when the worker waits for tasks. */
			
 
				 	pthread_mutex_t *sched_mutex; /* mutex protecting sched_cond */
			
 
				 >>>>>>> .merge-right.r6541
			
@@ -186,6 +190,7 @@ struct _starpu_machine_config
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 
			
 
				 	/* all the sched ctx of the current instance of starpu */
			
 
				 	struct _starpu_sched_ctx sched_ctxs[STARPU_NMAX_SCHED_CTXS];
			
@@ -204,6 +209,11 @@ struct _starpu_machine_config
 
				 	/* this flag is set until the application is finished submitting tasks */
			
 
				 	unsigned submitting;
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+
			
 
				+	/* this flag is set until the application is finished submitting tasks */
			
 
				+	unsigned submitting;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 };
			
 
				 
			
 
				 /* Has starpu_shutdown already been called ? */
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -383,6 +383,7 @@ int _starpu_cpu_driver_deinit(struct starpu_driver *d STARPU_ATTRIBUTE_UNUSED)
 
				 
			
 
				 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CPU_KEY);
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -401,6 +402,8 @@ _starpu_cpu_worker(void *arg)
 
				 		_starpu_cpu_driver_run_once(&d);
			
 
				 	_starpu_cpu_driver_deinit(&d);
			
 
				 
			
 
				+=======
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	return NULL;
			
 
				 }
			
 
				 
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -29,6 +29,7 @@
 
				 #include <core/sched_policy.h>
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 #include <core/sched_ctx.h>
			
 
				 =======
			
 
				 =======
			
@@ -40,6 +41,9 @@
 
				 =======
			
 
				 #endif
			
 
				 >>>>>>> .merge-right.r7640
			
 
				+=======
			
 
				+#include <cuda_gl_interop.h>
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 /* the number of CUDA devices */
			
 
				 static int ncudagpus;
			
@@ -139,11 +143,17 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 
				 void starpu_cuda_set_device(int devid)
			
 
				 {
			
 
				 	cudaError_t cures;
			
 
				+<<<<<<< .working
			
 
				 	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
			
 
				 #if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
			
 
				 	unsigned i;
			
 
				 #endif
			
 
				+=======
			
 
				+	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
			
 
				+	unsigned i;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 #ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				 	if (conf->n_cuda_opengl_interoperability) {
			
 
				 		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
			
@@ -162,15 +172,36 @@ void starpu_cuda_set_device(int devid)
 
				 		}
			
 
				 #endif
			
 
				 
			
 
				+=======
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+	if (conf->n_cuda_opengl_interoperability) {
			
 
				+		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
			
 
				+		STARPU_ASSERT(0);
			
 
				+	}
			
 
				+#else
			
 
				+	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
			
 
				+		if (conf->cuda_opengl_interoperability[i] == devid) {
			
 
				+			cures = cudaGLSetGLDevice(devid);
			
 
				+			goto done;
			
 
				+		}
			
 
				+#endif
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	cures = cudaSetDevice(devid);
			
 
				+<<<<<<< .working
			
 
				 
			
 
				 #if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
			
 
				 done:
			
 
				 #endif
			
 
				+=======
			
 
				+
			
 
				+done:
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 }
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 static void init_context(int devid)
			
 
				 {
			
 
				 	cudaError_t cures;
			
@@ -198,6 +229,15 @@ static void init_context(int devid)
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				+=======
			
 
				+static void init_context(int devid)
			
 
				+{
			
 
				+	cudaError_t cures;
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				+
			
 
				+	starpu_cuda_set_device(devid);
			
 
				+
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	/* force CUDA to initialize the context for real */
			
 
				 	cures = cudaFree(0);
			
 
				 	if (STARPU_UNLIKELY(cures)) {
			
@@ -321,6 +361,7 @@ static int execute_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *arg
 
				 static struct _starpu_worker*
			
 
				 _starpu_get_worker_from_driver(struct starpu_driver *d)
			
 
				 {
			
 
				+<<<<<<< .working
			
 
				 	unsigned nworkers = starpu_worker_get_count();
			
 
				 	unsigned  workerid;
			
 
				 	for (workerid = 0; workerid < nworkers; workerid++)
			
@@ -333,19 +374,51 @@ _starpu_get_worker_from_driver(struct starpu_driver *d)
 
				 				return worker;
			
 
				 		}
			
 
				 	}
			
 
				+=======
			
 
				+	int workers[d->id.cuda_id + 1];
			
 
				+	int nworkers;
			
 
				+	nworkers = starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, workers, d->id.cuda_id+1);
			
 
				+	if (nworkers >= 0 && (unsigned) nworkers < d->id.cuda_id)
			
 
				+		return NULL; // No device was found.
			
 
				+	
			
 
				+	return _starpu_get_worker_struct(workers[d->id.cuda_id]);
			
 
				+}
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 	return NULL;
			
 
				 }
			
 
				+=======
			
 
				+/* XXX Should this be merged with _starpu_init_cuda ? */
			
 
				+int _starpu_cuda_driver_init(struct starpu_driver *d)
			
 
				+{
			
 
				+	struct _starpu_worker* args = _starpu_get_worker_from_driver(d);
			
 
				+	STARPU_ASSERT(args);
			
 
				 
			
 
				+	int devid = args->devid;
			
 
				+	unsigned memory_node = args->memory_node;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				+
			
 
				+<<<<<<< .working
			
 
				 /* XXX Should this be merged with _starpu_init_cuda ? */
			
 
				 int _starpu_cuda_driver_init(struct starpu_driver *d)
			
 
				 {
			
 
				 	struct _starpu_worker* args = _starpu_get_worker_from_driver(d);
			
 
				 	STARPU_ASSERT(args);
			
 
				+=======
			
 
				+#ifdef STARPU_USE_FXT
			
 
				+	_starpu_fxt_register_thread(args->bindid);
			
 
				+#endif
			
 
				+	_STARPU_TRACE_WORKER_INIT_START(_STARPU_FUT_CUDA_KEY, devid, memory_node);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 	int devid = args->devid;
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 	_starpu_worker_init(args, _STARPU_FUT_CUDA_KEY);
			
 
				+=======
			
 
				+	_starpu_set_local_memory_node_key(&args->memory_node);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 	init_context(devid);
			
 
				 
			
@@ -384,6 +457,7 @@ int _starpu_cuda_driver_init(struct starpu_driver *d)
 
				 }
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 	pthread_cond_t *sched_cond = &args->sched_cond;
			
 
				 	pthread_mutex_t *sched_mutex = &args->sched_mutex;
			
 
				 	struct timespec start_time, end_time;
			
@@ -399,19 +473,37 @@ int _starpu_cuda_driver_run_once(struct starpu_driver *d)
 
				 	struct _starpu_worker* args = _starpu_get_worker_from_driver(d);
			
 
				 	STARPU_ASSERT(args);
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+int _starpu_cuda_driver_run_once(struct starpu_driver *d)
			
 
				+{
			
 
				+	struct _starpu_worker* args = _starpu_get_worker_from_driver(d);
			
 
				+	STARPU_ASSERT(args);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 =======
			
 
				 	unsigned memnode = args->memory_node;
			
 
				 	int workerid = args->workerid;
			
 
				+=======
			
 
				+	unsigned memnode = args->memory_node;
			
 
				+	int workerid = args->workerid;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 >>>>>>> .merge-right.r6541
			
 
				 	_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 	_starpu_datawizard_progress(memnode, 1);
			
 
				 	_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				+=======
			
 
				+	_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				+	_starpu_datawizard_progress(memnode, 1);
			
 
				+	_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 <<<<<<< .working
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 		if (!task) 
			
 
				 		{
			
 
				 			_STARPU_PTHREAD_MUTEX_LOCK(sched_mutex);
			
@@ -427,8 +519,12 @@ int _starpu_cuda_driver_run_once(struct starpu_driver *d)
 
				 =======
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(args->sched_mutex);
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+	_STARPU_PTHREAD_MUTEX_LOCK(args->sched_mutex);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 			_STARPU_PTHREAD_MUTEX_UNLOCK(sched_mutex);
			
 
				 =======
			
 
				 	struct starpu_task *task = _starpu_pop_task(args);
			
@@ -437,10 +533,22 @@ int _starpu_cuda_driver_run_once(struct starpu_driver *d)
 
				 >>>>>>> .merge-right.r7640
			
 
				 	struct _starpu_job *j = NULL;
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+	struct starpu_task *task = _starpu_pop_task(args);
			
 
				+	struct _starpu_job *j = NULL;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 	task = _starpu_get_worker_task(args, workerid, memnode);
			
 
				+=======
			
 
				+	if (task == NULL)
			
 
				+	{
			
 
				+		if (_starpu_worker_can_block(memnode))
			
 
				+			_starpu_block_worker(workerid, args->sched_cond, args->sched_mutex);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 		if(idle)
			
 
				 		{
			
 
				 			_starpu_clock_gettime(&end_time);
			
@@ -454,14 +562,38 @@ int _starpu_cuda_driver_run_once(struct starpu_driver *d)
 
				 			}
			
 
				 			idle = 0;
			
 
				 		}
			
 
				+=======
			
 
				+		_STARPU_PTHREAD_MUTEX_UNLOCK(args->sched_mutex);
			
 
				+
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	_STARPU_PTHREAD_MUTEX_UNLOCK(args->sched_mutex);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 =======
			
 
				 	if (!task)
			
 
				 >>>>>>> .merge-right.r7640
			
 
				 		return 0;
			
 
				+=======
			
 
				+	STARPU_ASSERT(task);
			
 
				+	j = _starpu_get_job_associated_to_task(task);
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 	j = _starpu_get_job_associated_to_task(task);
			
 
				+=======
			
 
				+	/* can CUDA do that task ? */
			
 
				+	if (!_STARPU_CUDA_MAY_PERFORM(j))
			
 
				+	{
			
 
				+		/* this is neither a cuda or a cublas task */
			
 
				+		_starpu_push_task(j);
			
 
				+		return 0;
			
 
				+	}
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 	/* can CUDA do that task ? */
			
 
				 	if (!_STARPU_CUDA_MAY_PERFORM(j))
			
 
				 	{
			
@@ -469,7 +601,12 @@ int _starpu_cuda_driver_run_once(struct starpu_driver *d)
 
				 		_starpu_push_task(j);
			
 
				 		return 0;
			
 
				 	}
			
 
				+=======
			
 
				+	_starpu_set_current_task(task);
			
 
				+	args->current_task = j->task;
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 	_starpu_set_current_task(task);
			
 
				 	args->current_task = j->task;
			
 
				 
			
@@ -481,6 +618,16 @@ int _starpu_cuda_driver_run_once(struct starpu_driver *d)
 
				 	if (res)
			
 
				 	{
			
 
				 		switch (res)
			
 
				+=======
			
 
				+	int res = execute_job_on_cuda(j, args);
			
 
				+
			
 
				+	_starpu_set_current_task(NULL);
			
 
				+	args->current_task = NULL;
			
 
				+
			
 
				+	if (res)
			
 
				+	{
			
 
				+		switch (res)
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 		{
			
 
				 			case -EAGAIN:
			
 
				 				_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
			
@@ -490,10 +637,13 @@ int _starpu_cuda_driver_run_once(struct starpu_driver *d)
 
				 				STARPU_ABORT();
			
 
				 		}
			
 
				 <<<<<<< .working
			
 
				+<<<<<<< .working
			
 
				 
			
 
				 		_starpu_handle_job_termination(j, workerid);
			
 
				 =======
			
 
				 >>>>>>> .merge-right.r6541
			
 
				+=======
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	}
			
 
				 
			
 
				 	_starpu_handle_job_termination(j);
			
@@ -579,6 +729,7 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
 
				 	printf("oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
			
 
				 	STARPU_ABORT();
			
 
				 }
			
 
				+<<<<<<< .working
			
 
				 
			
 
				 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				 {
			
@@ -631,3 +782,57 @@ int _starpu_run_cuda(struct starpu_driver *d)
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				+=======
			
 
				+
			
 
				+int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				+{
			
 
				+	cudaError_t cures = 0;
			
 
				+
			
 
				+	if (stream)
			
 
				+	{
			
 
				+	     _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				+	     cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
			
 
				+	     _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				+	}
			
 
				+	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
			
 
				+	if (stream == NULL || cures)
			
 
				+	{
			
 
				+		/* do it in a synchronous fashion */
			
 
				+		cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
			
 
				+
			
 
				+		if (STARPU_UNLIKELY(cures))
			
 
				+			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+		return 0;
			
 
				+	}
			
 
				+
			
 
				+	return -EAGAIN;
			
 
				+}
			
 
				+
			
 
				+int _starpu_run_cuda(struct starpu_driver *d)
			
 
				+{
			
 
				+	STARPU_ASSERT(d && d->type == STARPU_CUDA_WORKER);
			
 
				+
			
 
				+	int workers[d->id.cuda_id + 1];
			
 
				+	int nworkers;
			
 
				+	nworkers = starpu_worker_get_ids_by_type(STARPU_CUDA_WORKER, workers, d->id.cuda_id+1);
			
 
				+	if (nworkers >= 0 && (unsigned) nworkers < d->id.cuda_id)
			
 
				+		return -ENODEV;
			
 
				+	
			
 
				+	_STARPU_DEBUG("Running cuda %d from the application\n", d->id.cuda_id);
			
 
				+
			
 
				+	struct _starpu_worker *workerarg = _starpu_get_worker_struct(workers[d->id.cuda_id]);
			
 
				+
			
 
				+	workerarg->set = NULL;
			
 
				+	workerarg->worker_is_initialized = 0;
			
 
				+
			
 
				+	/* Let's go ! */
			
 
				+	_starpu_cuda_worker(workerarg);
			
 
				+
			
 
				+	/* XXX: Should we wait for the driver to be ready, as it is done when
			
 
				+	 * launching it the usual way ? Cf. the end of _starpu_launch_drivers()
			
 
				+	 */
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+>>>>>>> .merge-right.r6541
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -596,11 +596,14 @@ void *_starpu_opencl_worker(void *arg)
 
				 		.id.opencl_id = id
			
 
				 	};
			
 
				 
			
 
				+<<<<<<< .working
			
 
				 	_starpu_opencl_driver_init(&d);
			
 
				 	while (_starpu_machine_is_running())
			
 
				 		_starpu_opencl_driver_run_once(&d);
			
 
				 	_starpu_opencl_driver_deinit(&d);
			
 
				 
			
 
				+=======
			
 
				+>>>>>>> .merge-right.r6541
			
 
				 	return NULL;
			
 
				 }
			
 
				 
			
@@ -684,6 +687,7 @@ static int _starpu_opencl_execute_job(struct _starpu_job *j, struct _starpu_work
 
				 
			
 
				 	return EXIT_SUCCESS;
			
 
				 }
			
 
				+<<<<<<< .working
			
 
				 
			
 
				 int _starpu_run_opencl(struct starpu_driver *d)
			
 
				 {
			
@@ -723,3 +727,44 @@ int _starpu_run_opencl(struct starpu_driver *d)
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				+=======
			
 
				+
			
 
				+int _starpu_run_opencl(struct starpu_driver *d)
			
 
				+{
			
 
				+	STARPU_ASSERT(d && d->type == STARPU_OPENCL_WORKER);
			
 
				+
			
 
				+	int nworkers;
			
 
				+	int workers[STARPU_MAXOPENCLDEVS];
			
 
				+	nworkers = starpu_worker_get_ids_by_type(STARPU_OPENCL_WORKER, workers, STARPU_MAXOPENCLDEVS);
			
 
				+	if (nworkers == 0)
			
 
				+		return -ENODEV;
			
 
				+
			
 
				+	int i;
			
 
				+	for (i = 0; i < nworkers; i++)
			
 
				+	{
			
 
				+		cl_device_id device;
			
 
				+		int devid = starpu_worker_get_devid(workers[i]);
			
 
				+		starpu_opencl_get_device(devid, &device);
			
 
				+		if (device == d->id.opencl_id)
			
 
				+			break;
			
 
				+	}
			
 
				+
			
 
				+	if (i == nworkers)
			
 
				+		return -ENODEV;
			
 
				+
			
 
				+	struct _starpu_worker *workerarg = _starpu_get_worker_struct(i);
			
 
				+	_STARPU_DEBUG("Running OpenCL %d from the application\n", workerarg->devid);
			
 
				+
			
 
				+	workerarg->set = NULL;
			
 
				+	workerarg->worker_is_initialized = 0;
			
 
				+
			
 
				+	/* Let's go ! */
			
 
				+	_starpu_opencl_worker(workerarg);
			
 
				+
			
 
				+	/* XXX: Should we wait for the driver to be ready, as it is done when
			
 
				+	 * launching it the usual way ? Cf. the end of _starpu_launch_drivers()
			
 
				+	 */
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+>>>>>>> .merge-right.r6541
			
--- a/tests/datawizard/write_only_tmp_buffer.c
+++ b/tests/datawizard/write_only_tmp_buffer.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -28,6 +28,8 @@
 
				 starpu_data_handle_t v_handle;
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				+#include <starpu_opencl.h>
			
 
				+
			
 
				 static void opencl_codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				 	STARPU_SKIP_IF_VALGRIND;
			
@@ -51,8 +53,7 @@ static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_arg
 
				 
			
 
				 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				-	cudaMemsetAsync(buf, 42, 1, starpu_cuda_get_local_stream());
			
 
				-	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+	cudaMemset(buf, 42, 1);
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -72,7 +73,7 @@ static void display_var(void *descr[], __attribute__ ((unused)) void *_args)
 
				 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 	if (*buf != 42)
			
 
				 	{
			
 
				-		FPRINTF(stderr, "Value = <%c> (should be <%c>)\n", *buf, 42);
			
 
				+		FPRINTF(stderr, "Value = '%c' (should be '%c')\n", *buf, 42);
			
 
				 		exit(-1);
			
 
				 	}
			
 
				 }
			
--- a/tests/parallel_tasks/explicit_combined_worker.c
+++ b/tests/parallel_tasks/explicit_combined_worker.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,68 +15,62 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include <config.h>
			
 
				 #include <starpu.h>
			
 
				 #include <limits.h>
			
 
				 #include <unistd.h>
			
 
				+#include "../helper.h"
			
 
				 
			
 
				 #define N	1000
			
 
				 #define VECTORSIZE	1024
			
 
				 
			
 
				-//static pthread_mutex_t mutex;
			
 
				-//static pthread_cond_t cond;
			
 
				-//static unsigned finished = 0;
			
 
				-
			
 
				-static unsigned cnt;
			
 
				-
			
 
				-starpu_data_handle v_handle;
			
 
				-static unsigned *v;
			
 
				-
			
 
				 static void codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				+
			
 
				 	int worker_size = starpu_combined_worker_get_size();
			
 
				-	assert(worker_size > 0);
			
 
				+	STARPU_ASSERT(worker_size > 0);
			
 
				 	usleep(1000/worker_size);
			
 
				 #if 0
			
 
				 	int id = starpu_worker_get_id();
			
 
				 	int combined_id = starpu_combined_worker_get_id();
			
 
				-	fprintf(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
			
 
				+	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-static starpu_codelet cl = {
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.type = STARPU_FORKJOIN,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				-	.cpu_func = codelet_null,
			
 
				-	.cuda_func = codelet_null,
			
 
				-        .opencl_func = codelet_null,
			
 
				-	.nbuffers = 1
			
 
				+	.cpu_funcs = {codelet_null, NULL},
			
 
				+	.cuda_funcs = {codelet_null, NULL},
			
 
				+        .opencl_funcs = {codelet_null, NULL},
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_R}
			
 
				 };
			
 
				 
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-//        struct starpu_conf conf = {
			
 
				-//                .sched_policy_name = "pheft",
			
 
				-//                .ncpus = -1,
			
 
				-//                .ncuda = -1,
			
 
				-//                .nopencl = -1,
			
 
				-//                .nspus = -1,
			
 
				-//                .use_explicit_workers_bindid = 0,
			
 
				-//                .use_explicit_workers_cuda_gpuid = 0,
			
 
				-//                .use_explicit_workers_opencl_gpuid = 0,
			
 
				-//                .calibrate = -1
			
 
				-//        };
			
 
				-
			
 
				-	starpu_init(NULL);
			
 
				-
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&v, VECTORSIZE*sizeof(unsigned));
			
 
				+	starpu_data_handle_t v_handle;
			
 
				+	unsigned *v;
			
 
				+	int ret;
			
 
				+
			
 
				+//      struct starpu_conf conf;
			
 
				+//      starpu_conf_init(&conf);
			
 
				+//      conf.sched_policy_name = "pheft";
			
 
				+//      conf.calibrate = 1;
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
			
 
				 	starpu_vector_data_register(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
			
 
				 
			
 
				 	unsigned nworker = starpu_worker_get_count() + starpu_combined_worker_get_count();
			
 
				 
			
 
				-	cnt = nworker*N;
			
 
				-
			
 
				 	unsigned iter, worker;
			
 
				 	for (iter = 0; iter < N; iter++)
			
 
				 	{
			
@@ -86,27 +80,32 @@ int main(int argc, char **argv)
 
				 			struct starpu_task *task = starpu_task_create();
			
 
				 			task->cl = &cl;
			
 
				 
			
 
				-			task->buffers[0].handle = v_handle;
			
 
				-			task->buffers[0].mode = STARPU_R;
			
 
				+			task->handles[0] = v_handle;
			
 
				 
			
 
				 			task->execute_on_a_specific_worker = 1;
			
 
				 			task->workerid = worker;
			
 
				 
			
 
				 			int ret = starpu_task_submit(task);
			
 
				-			if (ret == -ENODEV)
			
 
				-				goto enodev;
			
 
				+			if (ret == -ENODEV) goto enodev;
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_wait_for_all();
			
 
				+	ret = starpu_task_wait_for_all();
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 
			
 
				+	starpu_data_unregister(v_handle);
			
 
				+	starpu_free(v);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	return 0;
			
 
				+	return EXIT_SUCCESS;
			
 
				 
			
 
				 enodev:
			
 
				+	starpu_data_unregister(v_handle);
			
 
				+	starpu_free(v);
			
 
				 	fprintf(stderr, "WARNING: No one can execute this task\n");
			
 
				 	/* yes, we do not perform the computation but we did detect that no one
			
 
				  	 * could perform the kernel, so this is not an error from StarPU */
			
 
				-	return 0;
			
 
				+	starpu_shutdown();
			
 
				+	return STARPU_TEST_SKIPPED;
			
 
				 }
			
--- a/tests/parallel_tasks/parallel_kernels.c
+++ b/tests/parallel_tasks/parallel_kernels.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,74 +15,69 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include <config.h>
			
 
				 #include <starpu.h>
			
 
				 #include <limits.h>
			
 
				 #include <unistd.h>
			
 
				+#include "../helper.h"
			
 
				 
			
 
				 #define N	1000
			
 
				 #define VECTORSIZE	1024
			
 
				 
			
 
				-//static pthread_mutex_t mutex;
			
 
				-//static pthread_cond_t cond;
			
 
				-//static unsigned finished = 0;
			
 
				-
			
 
				-static unsigned cnt;
			
 
				-
			
 
				-starpu_data_handle v_handle;
			
 
				-static unsigned *v;
			
 
				-
			
 
				 static void codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				+
			
 
				 	int worker_size = starpu_combined_worker_get_size();
			
 
				-	assert(worker_size > 0);
			
 
				+	STARPU_ASSERT(worker_size > 0);
			
 
				 	usleep(1000/worker_size);
			
 
				 #if 0
			
 
				 	int id = starpu_worker_get_id();
			
 
				 	int combined_id = starpu_combined_worker_get_id();
			
 
				-	fprintf(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
			
 
				+	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d\n", id, combined_id, worker_size);
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-struct starpu_perfmodel_t model = {
			
 
				+struct starpu_perfmodel model =
			
 
				+{
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "parallel_kernel_test"
			
 
				 };
			
 
				 
			
 
				-static starpu_codelet cl = {
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.type = STARPU_FORKJOIN,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				-	.cpu_func = codelet_null,
			
 
				-	.cuda_func = codelet_null,
			
 
				-        .opencl_func = codelet_null,
			
 
				+	.cpu_funcs = {codelet_null, NULL},
			
 
				+	.cuda_funcs = {codelet_null, NULL},
			
 
				+        .opencl_funcs = {codelet_null, NULL},
			
 
				 	.model = &model,
			
 
				-	.nbuffers = 1
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_R}
			
 
				 };
			
 
				 
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-        struct starpu_conf conf = {
			
 
				-                .sched_policy_name = "pheft",
			
 
				-                .ncpus = -1,
			
 
				-                .ncuda = -1,
			
 
				-                .nopencl = -1,
			
 
				-                .nspus = -1,
			
 
				-                .use_explicit_workers_bindid = 0,
			
 
				-                .use_explicit_workers_cuda_gpuid = 0,
			
 
				-                .use_explicit_workers_opencl_gpuid = 0,
			
 
				-                .calibrate = 1
			
 
				-        };
			
 
				-
			
 
				-	starpu_init(&conf);
			
 
				-
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&v, VECTORSIZE*sizeof(unsigned));
			
 
				+	int ret;
			
 
				+	starpu_data_handle_t v_handle;
			
 
				+	unsigned *v;
			
 
				+
			
 
				+        struct starpu_conf conf;
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.sched_policy_name = "pheft";
			
 
				+	conf.calibrate = 1;
			
 
				+
			
 
				+	ret = starpu_init(&conf);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
			
 
				 	starpu_vector_data_register(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
			
 
				 
			
 
				 	unsigned nworker = starpu_worker_get_count() + starpu_combined_worker_get_count();
			
 
				 
			
 
				-	cnt = nworker*N;
			
 
				-
			
 
				 	unsigned iter, worker;
			
 
				 	for (iter = 0; iter < N; iter++)
			
 
				 	{
			
@@ -92,24 +87,29 @@ int main(int argc, char **argv)
 
				 			struct starpu_task *task = starpu_task_create();
			
 
				 			task->cl = &cl;
			
 
				 
			
 
				-			task->buffers[0].handle = v_handle;
			
 
				-			task->buffers[0].mode = STARPU_R;
			
 
				+			task->handles[0] = v_handle;
			
 
				 
			
 
				 			int ret = starpu_task_submit(task);
			
 
				-			if (ret == -ENODEV)
			
 
				-				goto enodev;
			
 
				+			if (ret == -ENODEV) goto enodev;
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_wait_for_all();
			
 
				+	ret = starpu_task_wait_for_all();
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 
			
 
				+	starpu_data_unregister(v_handle);
			
 
				+	starpu_free(v);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	return 0;
			
 
				+	STARPU_RETURN(EXIT_SUCCESS);
			
 
				 
			
 
				 enodev:
			
 
				+	starpu_data_unregister(v_handle);
			
 
				+	starpu_free(v);
			
 
				 	fprintf(stderr, "WARNING: No one can execute this task\n");
			
 
				 	/* yes, we do not perform the computation but we did detect that no one
			
 
				  	 * could perform the kernel, so this is not an error from StarPU */
			
 
				-	return 0;
			
 
				+	starpu_shutdown();
			
 
				+	STARPU_RETURN(STARPU_TEST_SKIPPED);
			
 
				 }
			
--- a/tests/parallel_tasks/parallel_kernels_spmd.c
+++ b/tests/parallel_tasks/parallel_kernels_spmd.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,76 +15,71 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include <config.h>
			
 
				 #include <starpu.h>
			
 
				 #include <limits.h>
			
 
				 #include <unistd.h>
			
 
				+#include "../helper.h"
			
 
				 
			
 
				 #define N	1000
			
 
				 #define VECTORSIZE	1024
			
 
				 
			
 
				-//static pthread_mutex_t mutex;
			
 
				-//static pthread_cond_t cond;
			
 
				-//static unsigned finished = 0;
			
 
				-
			
 
				-static unsigned cnt;
			
 
				-
			
 
				-starpu_data_handle v_handle;
			
 
				-static unsigned *v;
			
 
				-
			
 
				 static void codelet_null(void *descr[], __attribute__ ((unused)) void *_args)
			
 
				 {
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				+
			
 
				 	int worker_size = starpu_combined_worker_get_size();
			
 
				-	assert(worker_size > 0);
			
 
				+	STARPU_ASSERT(worker_size > 0);
			
 
				 
			
 
				 	usleep(1000/worker_size);
			
 
				 #if 0
			
 
				 	int id = starpu_worker_get_id();
			
 
				 	int combined_id = starpu_combined_worker_get_id();
			
 
				 	int rank = starpu_combined_worker_get_rank();
			
 
				-	fprintf(stderr, "worker id %d - combined id %d - worker size %d - SPMD rank %d\n", id, combined_id, worker_size, rank);
			
 
				+	FPRINTF(stderr, "worker id %d - combined id %d - worker size %d - SPMD rank %d\n", id, combined_id, worker_size, rank);
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-struct starpu_perfmodel_t model = {
			
 
				+struct starpu_perfmodel model =
			
 
				+{
			
 
				 	.type = STARPU_HISTORY_BASED,
			
 
				 	.symbol = "parallel_kernel_test_spmd"
			
 
				 };
			
 
				 
			
 
				-static starpu_codelet cl = {
			
 
				+static struct starpu_codelet cl =
			
 
				+{
			
 
				 	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.type = STARPU_SPMD,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				-	.cpu_func = codelet_null,
			
 
				-	.cuda_func = codelet_null,
			
 
				-        .opencl_func = codelet_null,
			
 
				+	.cpu_funcs = {codelet_null, NULL},
			
 
				+	.cuda_funcs = {codelet_null, NULL},
			
 
				+        .opencl_funcs = {codelet_null, NULL},
			
 
				 	.model = &model,
			
 
				-	.nbuffers = 1
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_R}
			
 
				 };
			
 
				 
			
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-        struct starpu_conf conf = {
			
 
				-                .sched_policy_name = "pheft",
			
 
				-                .ncpus = -1,
			
 
				-                .ncuda = -1,
			
 
				-                .nopencl = -1,
			
 
				-                .nspus = -1,
			
 
				-                .use_explicit_workers_bindid = 0,
			
 
				-                .use_explicit_workers_cuda_gpuid = 0,
			
 
				-                .use_explicit_workers_opencl_gpuid = 0,
			
 
				-                .calibrate = 1
			
 
				-        };
			
 
				-
			
 
				-	starpu_init(&conf);
			
 
				-
			
 
				-	starpu_data_malloc_pinned_if_possible((void **)&v, VECTORSIZE*sizeof(unsigned));
			
 
				+	int ret;
			
 
				+	starpu_data_handle_t v_handle;
			
 
				+	unsigned *v;
			
 
				+
			
 
				+        struct starpu_conf conf;
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.sched_policy_name = "pheft";
			
 
				+	conf.calibrate = 1;
			
 
				+
			
 
				+	ret = starpu_init(&conf);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	starpu_malloc((void **)&v, VECTORSIZE*sizeof(unsigned));
			
 
				 	starpu_vector_data_register(&v_handle, 0, (uintptr_t)v, VECTORSIZE, sizeof(unsigned));
			
 
				 
			
 
				 	unsigned nworker = starpu_worker_get_count() + starpu_combined_worker_get_count();
			
 
				 
			
 
				-	cnt = nworker*N;
			
 
				-
			
 
				 	unsigned iter, worker;
			
 
				 	for (iter = 0; iter < N; iter++)
			
 
				 	{
			
@@ -94,24 +89,28 @@ int main(int argc, char **argv)
 
				 			struct starpu_task *task = starpu_task_create();
			
 
				 			task->cl = &cl;
			
 
				 
			
 
				-			task->buffers[0].handle = v_handle;
			
 
				-			task->buffers[0].mode = STARPU_R;
			
 
				+			task->handles[0] = v_handle;
			
 
				 
			
 
				 			int ret = starpu_task_submit(task);
			
 
				-			if (ret == -ENODEV)
			
 
				-				goto enodev;
			
 
				+			if (ret == -ENODEV) goto enodev;
			
 
				+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	starpu_task_wait_for_all();
			
 
				+	ret = starpu_task_wait_for_all();
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				 
			
 
				+	starpu_data_unregister(v_handle);
			
 
				+	starpu_free(v);
			
 
				 	starpu_shutdown();
			
 
				-
			
 
				-	return 0;
			
 
				+	STARPU_RETURN(EXIT_SUCCESS);
			
 
				 
			
 
				 enodev:
			
 
				+	starpu_data_unregister(v_handle);
			
 
				+	starpu_free(v);
			
 
				 	fprintf(stderr, "WARNING: No one can execute this task\n");
			
 
				 	/* yes, we do not perform the computation but we did detect that no one
			
 
				  	 * could perform the kernel, so this is not an error from StarPU */
			
 
				-	return 0;
			
 
				+	starpu_shutdown();
			
 
				+	STARPU_RETURN(STARPU_TEST_SKIPPED);
			
 
				 }
			
--- a/tools/starpu_perfmodel_plot.c
+++ b/tools/starpu_perfmodel_plot.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -24,6 +24,8 @@
 
				 #include <limits.h>
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <starpu_perfmodel.h>
			
 
				+#include <starpu_fxt.h>
			
 
				 #include <core/perfmodel/perfmodel.h> // we need to browse the list associated to history-based models
			
 
				 
			
 
				 #ifdef __MINGW32__
			
@@ -72,7 +74,6 @@ given perfmodel\n");
 
				 	fprintf(stderr, "   -h, --help          display this help and exit\n");
			
 
				 	fprintf(stderr, "   -v, --version       output version information and exit\n\n");
			
 
				         fprintf(stderr, "Report bugs to <%s>.", PACKAGE_BUGREPORT);
			
 
				-        fprintf(stderr, "\n");
			
 
				 }
			
 
				 
			
 
				 static void parse_args(int argc, char **argv)
			
@@ -146,14 +147,6 @@ static void parse_args(int argc, char **argv)
 
				 			continue;
			
 
				 		}
			
 
				 	}
			
 
				-
			
 
				-	if (!symbol && !list)
			
 
				-	{
			
 
				-		fprintf(stderr, "Incorrect usage, aborting\n");
			
 
				-                usage(argv);
			
 
				-		exit(-1);
			
 
				-	}
			
 
				-
			
 
				 }
			
 
				 
			
 
				 static void print_comma(FILE *gnuplot_file, int *first)
			
@@ -172,7 +165,7 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 
				 	char arch_name[256];
			
 
				 	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
			
 
				 
			
 
				-	struct starpu_perfmodel_per_arch *arch_model =
			
 
				+	struct starpu_per_arch_perfmodel *arch_model =
			
 
				 		&model->per_arch[arch][nimpl];
			
 
				 
			
 
				 	if (arch_model->regression.valid || arch_model->regression.nl_valid)
			
@@ -218,7 +211,7 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 	char *command;
			
 
				 	FILE *datafile;
			
 
				 	unsigned arch;
			
 
				-	struct starpu_perfmodel_history_list *ptr;
			
 
				+	struct starpu_history_list *ptr;
			
 
				 	char archname[32];
			
 
				 	int col;
			
 
				 	int len;
			
@@ -233,7 +226,7 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 	unsigned implid;
			
 
				 	for (arch = arch1; arch < arch2; arch++) {
			
 
				 		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
			
 
				-			struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
			
 
				+			struct starpu_per_arch_perfmodel *arch_model = &model->per_arch[arch][implid];
			
 
				 			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, implid);
			
 
				 
			
 
				 			//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
			
@@ -253,7 +246,7 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 		/* Get the next minimum */
			
 
				 		for (arch = arch1; arch < arch2; arch++)
			
 
				 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
			
 
				-				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
			
 
				+				struct starpu_per_arch_perfmodel *arch_model = &model->per_arch[arch][implid];
			
 
				 				for (ptr = arch_model->list; ptr; ptr = ptr->next) {
			
 
				 					unsigned long size = ptr->entry->size;
			
 
				 					if (size > last && size < minimum)
			
@@ -267,9 +260,9 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 
				 		fprintf(datafile, "%-15lu ", minimum);
			
 
				 		for (arch = arch1; arch < arch2; arch++) {
			
 
				 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
			
 
				-				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
			
 
				+				struct starpu_per_arch_perfmodel *arch_model = &model->per_arch[arch][implid];
			
 
				 				for (ptr = arch_model->list; ptr; ptr = ptr->next) {
			
 
				-					struct starpu_perfmodel_history_entry *entry = ptr->entry;
			
 
				+					struct starpu_history_entry *entry = ptr->entry;
			
 
				 					if (entry->size == minimum) {
			
 
				 						fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
			
 
				 						break;
			
@@ -412,7 +405,7 @@ int main(int argc, char **argv)
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				         if (list) {
			
 
				-                int ret = starpu_perfmodel_list(stdout);
			
 
				+                int ret = starpu_list_models(stdout);
			
 
				                 if (ret) {
			
 
				                         fprintf(stderr, "The performance model directory is invalid\n");
			
 
				                         return 1;
			
@@ -420,11 +413,18 @@ int main(int argc, char **argv)
 
				 		return 0;
			
 
				         }
			
 
				 
			
 
				+	/* We need at least a symbol name */
			
 
				+	if (!symbol)
			
 
				+	{
			
 
				+		fprintf(stderr, "No symbol was specified\n");
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				 	/* Load the performance model associated to the symbol */
			
 
				-	ret = starpu_perfmodel_load_symbol(symbol, &model);
			
 
				+	ret = starpu_load_history_debug(symbol, &model);
			
 
				 	if (ret == 1)
			
 
				 	{
			
 
				-		fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", symbol);
			
 
				+		fprintf(stderr, "The performance model could not be loaded\n");
			
 
				 		return 1;
			
 
				 	}