лет назад: 13 · 61172af3df
--- a/ChangeLog
+++ b/ChangeLog
@@ -23,14 +23,41 @@ New features:
 
				   * Capability to load compiled OpenCL kernels
			
 
				   * Performance models measurements can now be provided explicitly by
			
 
				     applications.
			
 
				+  * Capability to emit communication statistics when running MPI code
			
 
				+  * Add starpu_block_shadow_filter_func_vector and an example.
			
 
				+  * Add starpu_unregister_submit, starpu_data_acquire_on_node and
			
 
				+    starpu_data_invalidate_submit
			
 
				+  * New functionnality to wrapper starpu_insert_task to pass a array of
			
 
				+	data_handles via the parameter STARPU_DATA_ARRAY
			
 
				+  * Enable GPU-GPU direct transfers.
			
 
				 
			
 
				 Changes:
			
 
				   * The FxT code can now be used on systems other than Linux.
			
 
				+  * Keep only one hashtable implementation common/uthash.h
			
 
				+  * Add tag dependency in trace-generated DAG.
			
 
				+  * Fix CPU binding for optimized CPU-GPU transfers.
			
 
				+  * The cache of starpu_mpi_insert_task is fixed and thus now enabled by
			
 
				+    default.
			
 
				 
			
 
				 Small changes:
			
 
				   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is
			
 
				 	still available for compatibility reasons.
			
 
				 
			
 
				+StarPU 1.0.1 (svn revision 6659)
			
 
				+==============================================
			
 
				+
			
 
				+Changes:
			
 
				+  * hwloc support. Warn users when hwloc is not found on the system and
			
 
				+	produce error when not explicitely disabled.
			
 
				+  * Several bug fixes
			
 
				+  * GCC plug-in
			
 
				+	- Add `#pragma starpu release'
			
 
				+	- Fix bug when using `acquire' pragma with function parameters
			
 
				+	- Slightly improve test suite coverage
			
 
				+	- Relax the GCC version check
			
 
				+  * Update SOCL to use new API
			
 
				+  * Documentation improvement.
			
 
				+
			
 
				 StarPU 1.0.0 (svn revision 6306)
			
 
				 ==============================================
			
 
				 The extensions-again release
			
--- a/configure.ac
+++ b/configure.ac
@@ -177,6 +177,12 @@ AC_CHECK_FUNC([setenv], [AC_DEFINE([STARPU_HAVE_SETENV], [1], [Define to 1 if th
 
				 # Some systems do not define unsetenv
			
 
				 AC_CHECK_FUNC([unsetenv], [AC_DEFINE([STARPU_HAVE_UNSETENV], [1], [Define to 1 if the function unsetenv is available.])])
			
 
				 
			
 
				+# Some systems do not define nearbyintf...
			
 
				+AC_CHECK_FUNC([nearbyintf], [AC_DEFINE([STARPU_HAVE_NEARBYINTF], [1], [Define to 1 if the function nearbyintf is available.])])
			
 
				+
			
 
				+# ... but they may define rintf.
			
 
				+AC_CHECK_FUNC([rintf], [AC_DEFINE([STARPU_HAVE_RINTF], [1], [Define to 1 if the function rintf is available.])])
			
 
				+
			
 
				 # Define slow machine
			
 
				 AC_ARG_ENABLE(slow-machine, [AS_HELP_STRING([--enable-slow-machine],
			
 
				 				   [Lower default values for the testcases run by make check])],
			
--- a/doc/chapters/advanced-examples.texi
+++ b/doc/chapters/advanced-examples.texi
@@ -294,6 +294,10 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
 
				 @end smallexample
			
 
				 @end cartouche
			
 
				 
			
 
				+StarPU provides various interfaces and filters for matrices, vectors, etc.,
			
 
				+but applications can also write their own data interfaces and filters, see
			
 
				+@code{examples/interface} and @code{examples/filters/custom_mf} for an example.
			
 
				+
			
 
				 @node Performance model example
			
 
				 @section Performance model example
			
 
				 
			
@@ -486,12 +490,17 @@ The arguments following the codelets can be of the following types:
 
				 @item
			
 
				 @code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;
			
 
				 @item
			
 
				+@code{STARPU_DATA_ARRAY} followed by an array of data handles and its number of elements;
			
 
				+@item
			
 
				 the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
			
 
				 @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
			
 
				 @code{STARPU_PRIORITY}, followed by the appropriated objects as
			
 
				 defined below.
			
 
				 @end itemize
			
 
				 
			
 
				+When using @code{STARPU_DATA_ARRAY}, the access mode of the data
			
 
				+handles is not defined.
			
 
				+
			
 
				 Parameters to be passed to the codelet implementation are defined
			
 
				 through the type @code{STARPU_VALUE}. The function
			
 
				 @code{starpu_codelet_unpack_args} must be called within the codelet
			
@@ -591,6 +600,16 @@ task->cl_arg_size = arg_buffer_size;
 
				 int ret = starpu_task_submit(task);
			
 
				 @end smallexample
			
 
				 
			
 
				+Here a similar call using @code{STARPU_DATA_ARRAY}.
			
 
				+
			
 
				+@smallexample
			
 
				+starpu_insert_task(&mycodelet,
			
 
				+                   STARPU_DATA_ARRAY, data_handles, 2,
			
 
				+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
			
 
				+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
			
 
				+                   0);
			
 
				+@end smallexample
			
 
				+
			
 
				 If some part of the task insertion depends on the value of some computation,
			
 
				 the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
			
 
				 instance, assuming that the index variable @code{i} was registered as handle
			
@@ -711,7 +730,7 @@ and destroy it on unregistration.
 
				 
			
 
				 In addition to that, it can be tedious for the application to have to unregister
			
 
				 the data, since it will not use its content anyway. The unregistration can be
			
 
				-done lazily by using the @code{starpu_data_unregister_lazy(handle)} function,
			
 
				+done lazily by using the @code{starpu_data_unregister_submit(handle)} function,
			
 
				 which will record that no more tasks accessing the handle will be submitted, so
			
 
				 that it can be freed as soon as the last task accessing it is over.
			
 
				 
			
@@ -724,7 +743,7 @@ starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
 
				 starpu_insert_task(&produce_data, STARPU_W, handle, 0);
			
 
				 starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
			
 
				 starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
			
 
				-starpu_data_unregister_lazy(handle);
			
 
				+starpu_data_unregister_submit(handle);
			
 
				 @end smallexample
			
 
				 
			
 
				 @subsection Scratch data
			
@@ -1010,7 +1029,8 @@ constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
 
				 to be the one that runs CUDA computations for that GPU.
			
 
				 
			
 
				 To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
			
 
				-to @code{./configure} (TODO: make it dynamic), the interoperability mode has to
			
 
				+to @code{./configure} (TODO: make it dynamic), OpenGL/GLUT has to be initialized
			
 
				+first, and the interoperability mode has to
			
 
				 be enabled by using the @code{cuda_opengl_interoperability} field of the
			
 
				 @code{starpu_conf} structure, and the driver loop has to be run by
			
 
				 the application, by using the @code{not_launched_drivers} field of
			
@@ -1026,14 +1046,24 @@ the CUDA pointer at registration, for instance:
 
				 
			
 
				 @cartouche
			
 
				 @smallexample
			
 
				+/* Get the CUDA worker id */
			
 
				 for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
			
 
				         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
			
 
				                 break;
			
 
				 
			
 
				+/* Build a CUDA pointer pointing at the OpenGL buffer */
			
 
				 cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
			
 
				+
			
 
				+/* And register it to StarPU */
			
 
				 starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
			
 
				 
			
 
				+/* The handle can now be used as usual */
			
 
				 starpu_insert_task(&cl, STARPU_RW, handle, 0);
			
 
				+
			
 
				+/* ... */
			
 
				+
			
 
				+/* This gets back data into the OpenGL buffer */
			
 
				+starpu_data_unregister(handle);
			
 
				 @end smallexample
			
 
				 @end cartouche
			
 
				 
			
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -123,8 +123,16 @@ contains the logical identifiers of the OpenCL devices to be used.
 
				 
			
 
				 @item @code{int calibrate} (default = 0)
			
 
				 If this flag is set, StarPU will calibrate the performance models when
			
 
				-executing tasks. If this value is equal to -1, the default value is used. This
			
 
				-can also be specified with the @code{STARPU_CALIBRATE} environment variable.
			
 
				+executing tasks. If this value is equal to @code{-1}, the default value is
			
 
				+used. If the value is equal to @code{1}, it will force continuing
			
 
				+calibration. If the value is equal to @code{2}, the existing performance
			
 
				+models will be overwritten. This can also be specified with the
			
 
				+@code{STARPU_CALIBRATE} environment variable.
			
 
				+
			
 
				+@item @code{int bus_calibrate} (default = 0)
			
 
				+If this flag is set, StarPU will recalibrate the bus.  If this value is equal
			
 
				+to @code{-1}, the default value is used.  This can also be specified with the
			
 
				+@code{STARPU_BUS_CALIBRATE} environment variable.
			
 
				 
			
 
				 @item @code{int single_combined_worker} (default = 0)
			
 
				 By default, StarPU executes parallel tasks concurrently.
			
@@ -165,7 +173,7 @@ with the default values. In case some configuration parameters are already
 
				 specified through environment variables, @code{starpu_conf_init} initializes
			
 
				 the fields of the structure according to the environment variables. For
			
 
				 instance if @code{STARPU_CALIBRATE} is set, its value is put in the
			
 
				-@code{.ncuda} field of the structure passed as argument.
			
 
				+@code{.calibrate} field of the structure passed as argument.
			
 
				 
			
 
				 Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
			
 
				 indicates that the argument was NULL.
			
@@ -201,8 +209,8 @@ StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
 
				 @end deftypefun
			
 
				 
			
 
				 @deftypefun int starpu_worker_get_count_by_type ({enum starpu_archtype} @var{type})
			
 
				-Returns the number of workers of the given type indicated by the argument. A positive
			
 
				-(or null) value is returned in case of success, @code{-EINVAL} indicates that
			
 
				+Returns the number of workers of the given @var{type}. A positive
			
 
				+(or @code{NULL}) value is returned in case of success, @code{-EINVAL} indicates that
			
 
				 the type is not valid otherwise.
			
 
				 @end deftypefun
			
 
				 
			
@@ -431,6 +439,10 @@ access to the handle must be performed in write-only mode. Accessing an
 
				 invalidated data in read-mode results in undefined behaviour.
			
 
				 @end deftypefun
			
 
				 
			
 
				+@deftypefun void starpu_data_invalidate_submit (starpu_data_handle_t @var{handle})
			
 
				+Submits invalidation of the data handle after completion of previously submitted tasks.
			
 
				+@end deftypefun
			
 
				+
			
 
				 @c TODO create a specific sections about user interaction with the DSM ?
			
 
				 
			
 
				 @deftypefun void starpu_data_set_wt_mask (starpu_data_handle_t @var{handle}, uint32_t @var{wt_mask})
			
@@ -486,7 +498,7 @@ be consistent with the access mode specified in the @var{mode} argument.
 
				 access the piece of data anymore.  Note that implicit data
			
 
				 dependencies are also enforced by @code{starpu_data_acquire}, i.e.
			
 
				 @code{starpu_data_acquire} will wait for all tasks scheduled to work on
			
 
				-the data, unless that they have not been disabled explictly by calling
			
 
				+the data, unless they have been disabled explictly by calling
			
 
				 @code{starpu_data_set_default_sequential_consistency_flag} or
			
 
				 @code{starpu_data_set_sequential_consistency_flag}.
			
 
				 @code{starpu_data_acquire} is a blocking call, so that it cannot be called from
			
@@ -497,18 +509,28 @@ tasks or from their callbacks (in that case, @code{starpu_data_acquire} returns
 
				 
			
 
				 @deftypefun int starpu_data_acquire_cb (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
			
 
				 @code{starpu_data_acquire_cb} is the asynchronous equivalent of
			
 
				-@code{starpu_data_release}. When the data specified in the first argument is
			
 
				+@code{starpu_data_acquire}. When the data specified in the first argument is
			
 
				 available in the appropriate access mode, the callback function is executed.
			
 
				 The application may access the requested data during the execution of this
			
 
				 callback. The callback function must call @code{starpu_data_release} once the
			
 
				 application does not need to access the piece of data anymore.
			
 
				 Note that implicit data dependencies are also enforced by
			
 
				-@code{starpu_data_acquire_cb} in case they are enabled.
			
 
				+@code{starpu_data_acquire_cb} in case they are not disabled.
			
 
				  Contrary to @code{starpu_data_acquire}, this function is non-blocking and may
			
 
				 be called from task callbacks. Upon successful completion, this function
			
 
				 returns 0.
			
 
				 @end deftypefun
			
 
				 
			
 
				+@deftypefun int starpu_data_acquire_on_node (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_access_mode} @var{mode})
			
 
				+This is the same as @code{starpu_data_acquire}, except that the data will be
			
 
				+available on the given memory node instead of main memory.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@deftypefun int starpu_data_acquire_on_node_cb (starpu_data_handle_t @var{handle}, unsigned @var{node}, {enum starpu_access_mode} @var{mode}, void (*@var{callback})(void *), void *@var{arg})
			
 
				+This is the same as @code{starpu_data_acquire_cb}, except that the data will be
			
 
				+available on the given memory node instead of main memory.
			
 
				+@end deftypefun
			
 
				+
			
 
				 @defmac STARPU_DATA_ACQUIRE_CB (starpu_data_handle_t @var{handle}, {enum starpu_access_mode} @var{mode}, code)
			
 
				 @code{STARPU_DATA_ACQUIRE_CB} is the same as @code{starpu_data_acquire_cb},
			
 
				 except that the code to be executed in a callback is directly provided as a
			
@@ -1054,7 +1076,8 @@ starpu_data_partition(A_handle, &f);
 
				 
			
 
				 @deftypefun void starpu_data_unpartition (starpu_data_handle_t @var{root_data}, uint32_t @var{gathering_node})
			
 
				 This unapplies one filter, thus unpartitioning the data. The pieces of data are
			
 
				-collected back into one big piece in the @var{gathering_node} (usually 0).
			
 
				+collected back into one big piece in the @var{gathering_node} (usually 0). Tasks
			
 
				+working on the partitioned data must be already finished when calling @code{starpu_data_unpartition}.
			
 
				 @cartouche
			
 
				 @smallexample
			
 
				 starpu_data_unpartition(A_handle, 0);
			
@@ -1277,7 +1300,9 @@ unset, its value will be automatically set based on the availability
 
				 of the @code{XXX_funcs} fields defined below.
			
 
				 
			
 
				 @item @code{int (*can_execute)(unsigned workerid, struct starpu_task *task, unsigned nimpl)} (optional)
			
 
				-Defines a function which should return 1 if the worker designated by @var{workerid} can execute the @var{nimpl}th implementation of the given@var{task}, 0 otherwise.
			
 
				+Defines a function which should return 1 if the worker designated by
			
 
				+@var{workerid} can execute the @var{nimpl}th implementation of the
			
 
				+given @var{task}, 0 otherwise.
			
 
				 
			
 
				 @item @code{enum starpu_codelet_type type} (optional)
			
 
				 The default is @code{STARPU_SEQ}, i.e. usual sequential implementation. Other
			
@@ -1945,10 +1970,6 @@ returns the path to the debugging information for the performance model.
 
				 returns the architecture name for @var{arch}.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_force_bus_sampling (void)
			
 
				-forces sampling the bus performance model again.
			
 
				-@end deftypefun
			
 
				-
			
 
				 @deftypefun {enum starpu_perf_archtype} starpu_worker_get_perf_archtype (int @var{workerid})
			
 
				 returns the architecture type of a given worker.
			
 
				 @end deftypefun
			
--- a/doc/chapters/configuration.texi
+++ b/doc/chapters/configuration.texi
@@ -19,6 +19,7 @@ The following arguments can be given to the @code{configure} script.
 
				 @menu
			
 
				 * Common configuration::        
			
 
				 * Configuring workers::         
			
 
				+* Extension configuration::     
			
 
				 * Advanced configuration::      
			
 
				 @end menu
			
 
				 
			
@@ -124,6 +125,33 @@ This information is then available as the
 
				 
			
 
				 @end table
			
 
				 
			
 
				+@node Extension configuration
			
 
				+@subsection Extension configuration
			
 
				+
			
 
				+@table @code
			
 
				+
			
 
				+@item --disable-socl
			
 
				+Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
			
 
				+default, it is enabled when an OpenCL implementation is found.
			
 
				+
			
 
				+@item --disable-starpu-top
			
 
				+Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
			
 
				+is enabled when the required dependencies are found.
			
 
				+
			
 
				+@item --disable-gcc-extensions
			
 
				+Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
			
 
				+enabled when the GCC compiler provides a plug-in support.
			
 
				+
			
 
				+@item --with-mpicc=@var{path}
			
 
				+Use the @command{mpicc} compiler at @var{path}, for starpumpi
			
 
				+(@pxref{StarPU MPI support}).
			
 
				+
			
 
				+@item --enable-comm-stats
			
 
				+Enable communication statistics for starpumpi (@pxref{StarPU MPI
			
 
				+support}).
			
 
				+
			
 
				+@end table
			
 
				+
			
 
				 @node Advanced configuration
			
 
				 @subsection Advanced configuration
			
 
				 
			
@@ -173,10 +201,6 @@ notably contain @code{include/fxt/fxt.h}.
 
				 Store performance models under @var{dir}, instead of the current user's
			
 
				 home.
			
 
				 
			
 
				-@item --with-mpicc=@var{path}
			
 
				-Use the @command{mpicc} compiler at @var{path}, for starpumpi
			
 
				-(@pxref{StarPU MPI support}).
			
 
				-
			
 
				 @item --with-goto-dir=@var{prefix}
			
 
				 Search for GotoBLAS under @var{prefix}.
			
 
				 
			
@@ -193,18 +217,6 @@ that the
 
				 @url{http://software.intel.com/en-us/articles/intel-mkl-link-line-advisor/,
			
 
				 MKL website} provides a script to determine the linking flags.
			
 
				 
			
 
				-@item --disable-gcc-extensions
			
 
				-Disable the GCC plug-in (@pxref{C Extensions}).  By default, it is
			
 
				-enabled when the GCC compiler provides a plug-in support.
			
 
				-
			
 
				-@item --disable-socl
			
 
				-Disable the SOCL extension (@pxref{SOCL OpenCL Extensions}).  By
			
 
				-default, it is enabled when an OpenCL implementation is found.
			
 
				-
			
 
				-@item --disable-starpu-top
			
 
				-Disable the StarPU-Top interface (@pxref{StarPU-Top}).  By default, it
			
 
				-is enabled when the required dependencies are found.
			
 
				-
			
 
				 @item --enable-sched-ctx-hypervisor
			
 
				 Enables the Scheduling Context Hypervisor plugin(@pxref{Scheduling Context Hypervisor}). 
			
 
				 By default, it is disabled.
			
@@ -340,6 +352,7 @@ Let the user give a hint to StarPU about which how many workers
 
				 @menu
			
 
				 * STARPU_SCHED::                Scheduling policy
			
 
				 * STARPU_CALIBRATE::            Calibrate performance models
			
 
				+* STARPU_BUS_CALIBRATE::        Calibrate bus
			
 
				 * STARPU_PREFETCH::             Use data prefetch
			
 
				 * STARPU_SCHED_ALPHA::          Computation factor
			
 
				 * STARPU_SCHED_BETA::           Communication factor
			
@@ -363,6 +376,11 @@ is the default behaviour.
 
				 
			
 
				 Note: this currently only applies to @code{dm}, @code{dmda} and @code{heft} scheduling policies.
			
 
				 
			
 
				+@node STARPU_BUS_CALIBRATE
			
 
				+@subsubsection @code{STARPU_BUS_CALIBRATE} -- Calibrate bus
			
 
				+
			
 
				+If this variable is set to 1, the bus is recalibrated during intialization.
			
 
				+
			
 
				 @node STARPU_PREFETCH
			
 
				 @subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
			
 
				 
			
--- a/doc/chapters/perf-optimization.texi
+++ b/doc/chapters/perf-optimization.texi
@@ -80,6 +80,42 @@ In the same vein, accumulation of results in the same data can become a
 
				 bottleneck. The use of the @code{STARPU_REDUX} mode permits to optimize such
			
 
				 accumulation (@pxref{Data reduction}).
			
 
				 
			
 
				+Applications often need a data just for temporary results.  In such a case,
			
 
				+registration can be made without an initial value, for instance this produces a vector data:
			
 
				+
			
 
				+@cartouche
			
 
				+@smallexample
			
 
				+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
			
 
				+@end smallexample
			
 
				+@end cartouche
			
 
				+
			
 
				+StarPU will then allocate the actual buffer only when it is actually needed,
			
 
				+e.g. directly on the GPU without allocating in main memory.
			
 
				+
			
 
				+In the same vein, once the temporary results are not useful any more, the
			
 
				+data should be thrown away. If the handle is not to be reused, it can be
			
 
				+unregistered:
			
 
				+
			
 
				+@cartouche
			
 
				+@smallexample
			
 
				+starpu_unregister_submit(handle);
			
 
				+@end smallexample
			
 
				+@end cartouche
			
 
				+
			
 
				+actual unregistration will be done after all tasks working on the handle
			
 
				+terminate.
			
 
				+
			
 
				+If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
			
 
				+
			
 
				+@cartouche
			
 
				+@smallexample
			
 
				+starpu_invalidate_submit(handle);
			
 
				+@end smallexample
			
 
				+@end cartouche
			
 
				+
			
 
				+the buffers containing the current value will then be freed, and reallocated
			
 
				+only when another task writes some value to the handle.
			
 
				+
			
 
				 @node Task granularity
			
 
				 @section Task granularity
			
 
				 
			
--- a/examples/basic_examples/vector_scal.c
+++ b/examples/basic_examples/vector_scal.c
@@ -23,6 +23,7 @@
 
				  *  3- how a kernel can manipulate the data (buffers[0].vector.ptr)
			
 
				  */
			
 
				 
			
 
				+#include <config.h>
			
 
				 #include <starpu.h>
			
 
				 #include <starpu_opencl.h>
			
 
				 #include <stdlib.h>
			
@@ -88,8 +89,15 @@ struct starpu_opencl_program opencl_program;
 
				 
			
 
				 static int approximately_equal(float a, float b)
			
 
				 {
			
 
				+#ifdef STARPU_HAVE_NEARBYINTF
			
 
				 	int ai = (int) nearbyintf(a * 1000.0);
			
 
				 	int bi = (int) nearbyintf(b * 1000.0);
			
 
				+#elif defined(STARPU_HAVE_RINTF)
			
 
				+	int ai = (int) rintf(a * 1000.0);
			
 
				+	int bi = (int) rintf(b * 1000.0);
			
 
				+#else
			
 
				+#error "Please define either nearbyintf or rintf."
			
 
				+#endif
			
 
				 	return ai == bi;
			
 
				 }
			
 
				 
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -113,6 +113,9 @@ struct starpu_conf
 
				 	unsigned use_explicit_workers_opencl_gpuid;
			
 
				 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
			
 
				 
			
 
				+	/* calibrate bus (-1 for default) */
			
 
				+	int bus_calibrate;
			
 
				+
			
 
				 	/* calibrate performance models, if any (-1 for default) */
			
 
				 	int calibrate;
			
 
				 
			
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -55,16 +55,20 @@ void starpu_data_unregister_no_coherency(starpu_data_handle_t handle);
 
				 /* Destroy the data handle once it is not needed anymore by any submitted task.
			
 
				  * No coherency is assumed.
			
 
				  */
			
 
				-void starpu_data_unregister_lazy(starpu_data_handle_t handle);
			
 
				+void starpu_data_unregister_submit(starpu_data_handle_t handle);
			
 
				 
			
 
				 /* Destroy all data replicates. After data invalidation, the first access to
			
 
				  * the handle must be performed in write-only mode. */
			
 
				 void starpu_data_invalidate(starpu_data_handle_t handle);
			
 
				+/* Same, but waits for previous task completion */
			
 
				+void starpu_data_invalidate_submit(starpu_data_handle_t handle);
			
 
				 
			
 
				 void starpu_data_advise_as_important(starpu_data_handle_t handle, unsigned is_important);
			
 
				 
			
 
				 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mode);
			
 
				+int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum starpu_access_mode mode);
			
 
				 int starpu_data_acquire_cb(starpu_data_handle_t handle, enum starpu_access_mode mode, void (*callback)(void *), void *arg);
			
 
				+int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, enum starpu_access_mode mode, void (*callback)(void *), void *arg);
			
 
				 #ifdef __GCC__
			
 
				 #  define STARPU_DATA_ACQUIRE_CB(handle, mode, code) do \
			
 
				 	{ \						\
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -35,7 +35,7 @@ extern "C"
 
				 
			
 
				 struct starpu_task;
			
 
				 
			
 
				-struct starpu_htbl32_node;
			
 
				+struct starpu_history_table;
			
 
				 struct starpu_history_list;
			
 
				 struct starpu_buffer_descr;
			
 
				 
			
@@ -155,7 +155,7 @@ struct starpu_per_arch_perfmodel
 
				 	size_t (*size_base)(struct starpu_task *, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 
			
 
				 	/* internal variables */
			
 
				-	struct starpu_htbl32_node *history;
			
 
				+	struct starpu_history_table *history;
			
 
				 	struct starpu_history_list *list;
			
 
				 	struct starpu_regression_model regression;
			
 
				 #ifdef STARPU_MODEL_DEBUG
			
@@ -211,7 +211,6 @@ int starpu_list_models(FILE *output);
 
				 double starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, uint32_t footprint);
			
 
				 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *, enum starpu_perf_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
			
 
				 
			
 
				-void starpu_force_bus_sampling(void);
			
 
				 void starpu_bus_print_bandwidth(FILE *f);
			
 
				 void starpu_bus_print_affinity(FILE *f);
			
 
				 
			
--- a/include/starpu_util.h
+++ b/include/starpu_util.h
@@ -249,6 +249,7 @@ int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_ha
 
				 #define STARPU_EXECUTE_ON_DATA	(1<<10)	/* Used by MPI to define which task is going to execute the codelet */
			
 
				 #define STARPU_HYPERVISOR_TAG	(1<<11)	/* Used to tag a task after whose execution we'll execute  a code */
			
 
				 #define STARPU_HYPERVISOR_FLOPS	(1<<12)	/* Used to specify the number of flops needed to be executed by a task */
			
 
				+#define STARPU_DATA_ARRAY       (1<<13) /* Array of data handles */
			
 
				 
			
 
				 /* Wrapper to create a task. */
			
 
				 int starpu_insert_task(struct starpu_codelet *cl, ...);
			
--- a/mpi/Makefile.am
+++ b/mpi/Makefile.am
@@ -76,7 +76,6 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined
 
				 noinst_HEADERS =					\
			
 
				 	starpu_mpi_private.h				\
			
 
				 	starpu_mpi_fxt.h				\
			
 
				-	starpu_mpi_insert_task_cache.h			\
			
 
				 	starpu_mpi_stats.h
			
 
				 
			
 
				 versincludedir = $(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)
			
@@ -89,7 +88,6 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 
				 	starpu_mpi_helper.c				\
			
 
				 	starpu_mpi_datatype.c				\
			
 
				 	starpu_mpi_insert_task.c			\
			
 
				-	starpu_mpi_insert_task_cache.c			\
			
 
				 	starpu_mpi_collective.c				\
			
 
				 	starpu_mpi_stats.c
			
 
				 
			
--- a/mpi/starpu_mpi.c
+++ b/mpi/starpu_mpi.c
@@ -768,10 +768,6 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 
				 static
			
 
				 int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
			
 
				 {
			
 
				-#ifdef STARPU_COMM_STATS
			
 
				-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-comm-stats, which slows down a bit\n");
			
 
				-#endif
			
 
				-
			
 
				 	_STARPU_PTHREAD_MUTEX_INIT(&mutex, NULL);
			
 
				 	_STARPU_PTHREAD_COND_INIT(&cond_progression, NULL);
			
 
				 	_STARPU_PTHREAD_COND_INIT(&cond_finished, NULL);
			
@@ -807,7 +803,7 @@ int _starpu_mpi_initialize(int initialize_mpi, int *rank, int *world_size)
 
				 #endif
			
 
				 
			
 
				 	_starpu_mpi_add_sync_point_in_fxt();
			
 
				-	_starpu_mpi_comm_amounts_init();
			
 
				+	_starpu_mpi_comm_amounts_init(MPI_COMM_WORLD);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -824,6 +820,10 @@ int starpu_mpi_initialize_extended(int *rank, int *world_size)
 
				 int starpu_mpi_shutdown(void)
			
 
				 {
			
 
				 	void *value;
			
 
				+	int rank;
			
 
				+
			
 
				+	/* We need to get the  rank before calling MPI_Finalize to pass to _starpu_mpi_comm_amounts_display() */
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 
			
 
				 	/* kill the progression thread */
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
@@ -841,7 +841,7 @@ int starpu_mpi_shutdown(void)
 
				 	_starpu_mpi_req_list_delete(detached_requests);
			
 
				 	_starpu_mpi_req_list_delete(new_requests);
			
 
				 
			
 
				-	_starpu_mpi_comm_amounts_display();
			
 
				+	_starpu_mpi_comm_amounts_display(rank);
			
 
				 	_starpu_mpi_comm_amounts_free();
			
 
				 
			
 
				 	return 0;
			
--- a/mpi/starpu_mpi_insert_task.c
+++ b/mpi/starpu_mpi_insert_task.c
@@ -21,38 +21,201 @@
 
				 #include <starpu.h>
			
 
				 #include <starpu_data.h>
			
 
				 #include <common/utils.h>
			
 
				-#include <common/htable64.h>
			
 
				+#include <common/uthash.h>
			
 
				 #include <util/starpu_insert_task_utils.h>
			
 
				 #include <datawizard/coherency.h>
			
 
				 
			
 
				 //#define STARPU_MPI_VERBOSE 1
			
 
				 #include <starpu_mpi_private.h>
			
 
				 
			
 
				-/* Whether we are allowed to keep copies of remote data. Does not work
			
 
				- * yet: the sender has to know whether the receiver has it, keeping it
			
 
				- * in an array indexed by node numbers. */
			
 
				-//#define MPI_CACHE 1
			
 
				-#include <starpu_mpi_insert_task_cache.h>
			
 
				+/* Whether we are allowed to keep copies of remote data. */
			
 
				+#define MPI_CACHE 1
			
 
				+
			
 
				+#ifdef MPI_CACHE
			
 
				+struct _starpu_data_entry
			
 
				+{
			
 
				+	UT_hash_handle hh;
			
 
				+	void *data;
			
 
				+};
			
 
				+
			
 
				+struct _starpu_data_entry **sent_data = NULL;
			
 
				+struct _starpu_data_entry **received_data = NULL;
			
 
				+#endif /* MPI_CACHE */
			
 
				 
			
 
				 static void _starpu_mpi_tables_init()
			
 
				 {
			
 
				+#ifdef MPI_CACHE
			
 
				 	if (sent_data == NULL) {
			
 
				 		int nb_nodes;
			
 
				 		int i;
			
 
				 
			
 
				 		MPI_Comm_size(MPI_COMM_WORLD, &nb_nodes);
			
 
				 		_STARPU_MPI_DEBUG("Initialising htable for cache\n");
			
 
				-		sent_data = malloc(nb_nodes * sizeof(struct starpu_htbl64_node *));
			
 
				+		sent_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
			
 
				 		for(i=0 ; i<nb_nodes ; i++) sent_data[i] = NULL;
			
 
				-		received_data = malloc(nb_nodes * sizeof(struct starpu_htbl64_node *));
			
 
				+		received_data = malloc(nb_nodes * sizeof(struct _starpu_data_entry *));
			
 
				 		for(i=0 ; i<nb_nodes ; i++) received_data[i] = NULL;
			
 
				 	}
			
 
				+#endif /* MPI_CACHE */
			
 
				+}
			
 
				+
			
 
				+static
			
 
				+int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *dest, size_t *size_on_nodes)
			
 
				+{
			
 
				+	if (data && mode & STARPU_R) {
			
 
				+		struct starpu_data_interface_ops *ops;
			
 
				+		int rank = starpu_data_get_rank(data);
			
 
				+
			
 
				+		ops = data->ops;
			
 
				+		size_on_nodes[rank] += ops->get_size(data);
			
 
				+	}
			
 
				+
			
 
				+	if (mode & STARPU_W) {
			
 
				+		if (!data) {
			
 
				+			/* We don't have anything allocated for this.
			
 
				+			 * The application knows we won't do anything
			
 
				+			 * about this task */
			
 
				+			/* Yes, the app could actually not call
			
 
				+			 * insert_task at all itself, this is just a
			
 
				+			 * safeguard. */
			
 
				+			_STARPU_MPI_DEBUG("oh oh\n");
			
 
				+			_STARPU_MPI_LOG_OUT();
			
 
				+			return -EINVAL;
			
 
				+		}
			
 
				+		int mpi_rank = starpu_data_get_rank(data);
			
 
				+		if (mpi_rank == me) {
			
 
				+			if (*do_execute == 0) {
			
 
				+				*inconsistent_execute = 1;
			
 
				+			}
			
 
				+			else {
			
 
				+				*do_execute = 1;
			
 
				+			}
			
 
				+		}
			
 
				+		else if (mpi_rank != -1) {
			
 
				+			if (*do_execute == 1) {
			
 
				+				*inconsistent_execute = 1;
			
 
				+			}
			
 
				+			else {
			
 
				+				*do_execute = 0;
			
 
				+				*dest = mpi_rank;
			
 
				+				/* That's the rank which needs the data to be sent to */
			
 
				+			}
			
 
				+		}
			
 
				+		else {
			
 
				+			_STARPU_ERROR("rank invalid\n");
			
 
				+		}
			
 
				+	}
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				-void _starpu_data_deallocate(starpu_data_handle_t data_handle)
			
 
				+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int dest, int do_execute, MPI_Comm comm)
			
 
				 {
			
 
				-#ifdef STARPU_DEVEL
			
 
				-#warning _starpu_data_deallocate not implemented yet
			
 
				+	if (data && mode & STARPU_R) {
			
 
				+		int mpi_rank = starpu_data_get_rank(data);
			
 
				+		int mpi_tag = starpu_data_get_tag(data);
			
 
				+		STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
			
 
				+		/* The task needs to read this data */
			
 
				+		if (do_execute && mpi_rank != me && mpi_rank != -1) {
			
 
				+			/* I will have to execute but I don't have the data, receive */
			
 
				+#ifdef MPI_CACHE
			
 
				+			struct _starpu_data_entry *already_received;
			
 
				+			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
			
 
				+			if (already_received == NULL) {
			
 
				+				struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
			
 
				+				entry->data = data;
			
 
				+				HASH_ADD_PTR(received_data[mpi_rank], data, entry);
			
 
				+			}
			
 
				+			else {
			
 
				+				_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
			
 
				+			}
			
 
				+			if (!already_received)
			
 
				+#endif
			
 
				+			{
			
 
				+				_STARPU_MPI_DEBUG("Receive data %p from %d\n", data, mpi_rank);
			
 
				+				starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
			
 
				+			}
			
 
				+		}
			
 
				+		if (!do_execute && mpi_rank == me) {
			
 
				+			/* Somebody else will execute it, and I have the data, send it. */
			
 
				+#ifdef MPI_CACHE
			
 
				+			struct _starpu_data_entry *already_sent;
			
 
				+			HASH_FIND_PTR(sent_data[dest], &data, already_sent);
			
 
				+			if (already_sent == NULL) {
			
 
				+				struct _starpu_data_entry *entry = (struct _starpu_data_entry *)malloc(sizeof(*entry));
			
 
				+				entry->data = data;
			
 
				+				HASH_ADD_PTR(sent_data[dest], data, entry);
			
 
				+				_STARPU_MPI_DEBUG("Noting that data %p has already been sent to %d\n", data, dest);
			
 
				+			}
			
 
				+			else {
			
 
				+				_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
			
 
				+			}
			
 
				+			if (!already_sent)
			
 
				+#endif
			
 
				+			{
			
 
				+				_STARPU_MPI_DEBUG("Send data %p to %d\n", data, dest);
			
 
				+				starpu_mpi_isend_detached(data, dest, mpi_tag, comm, NULL, NULL);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int xrank, int dest, int do_execute, MPI_Comm comm)
			
 
				+{
			
 
				+	if (mode & STARPU_W) {
			
 
				+		int mpi_rank = starpu_data_get_rank(data);
			
 
				+		int mpi_tag = starpu_data_get_tag(data);
			
 
				+		STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
			
 
				+		if (mpi_rank == me) {
			
 
				+			if (xrank != -1 && me != xrank) {
			
 
				+				_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
			
 
				+				starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
			
 
				+			}
			
 
				+		}
			
 
				+		else if (do_execute) {
			
 
				+			_STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
			
 
				+			starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum starpu_access_mode mode, int me, int do_execute, MPI_Comm comm)
			
 
				+{
			
 
				+#ifdef MPI_CACHE
			
 
				+	if (mode & STARPU_W) {
			
 
				+		if (do_execute) {
			
 
				+			/* Note that all copies I've sent to neighbours are now invalid */
			
 
				+			int n, size;
			
 
				+			MPI_Comm_size(comm, &size);
			
 
				+			for(n=0 ; n<size ; n++) {
			
 
				+				struct _starpu_data_entry *already_sent;
			
 
				+				HASH_FIND_PTR(sent_data[n], &data, already_sent);
			
 
				+				if (already_sent) {
			
 
				+					_STARPU_MPI_DEBUG("Clearing send cache for data %p\n", data);
			
 
				+					HASH_DEL(sent_data[n], already_sent);
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+		else {
			
 
				+			int mpi_rank = starpu_data_get_rank(data);
			
 
				+			struct _starpu_data_entry *already_received;
			
 
				+			HASH_FIND_PTR(received_data[mpi_rank], &data, already_received);
			
 
				+			if (already_received) {
			
 
				+				/* Somebody else will write to the data, so discard our cached copy if any */
			
 
				+				/* TODO: starpu_mpi could just remember itself. */
			
 
				+				_STARPU_MPI_DEBUG("Clearing receive cache for data %p\n", data);
			
 
				+				HASH_DEL(received_data[mpi_rank], already_received);
			
 
				+				starpu_data_invalidate_submit(data);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+#else
			
 
				+	/* We allocated a temporary buffer for the received data, now drop it */
			
 
				+	if ((mode & STARPU_R) && do_execute) {
			
 
				+		int mpi_rank = starpu_data_get_rank(data);
			
 
				+		if (mpi_rank != me && mpi_rank != -1) {
			
 
				+			starpu_data_invalidate_submit(data);
			
 
				+		}
			
 
				+	}
			
 
				 #endif
			
 
				 }
			
 
				 
			
@@ -65,6 +228,7 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 	size_t arg_buffer_size = 0;
			
 
				 	char *arg_buffer;
			
 
				 	int dest=0, inconsistent_execute;
			
 
				+	int current_data = 0;
			
 
				 
			
 
				 	_STARPU_MPI_LOG_IN();
			
 
				 
			
@@ -102,49 +266,30 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 		}
			
 
				 		else if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
			
 
				 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
			
 
				-
			
 
				-			if (data && arg_type & STARPU_R) {
			
 
				-				int rank = starpu_data_get_rank(data);
			
 
				-				struct starpu_data_interface_ops *ops;
			
 
				-				ops = data->ops;
			
 
				-				size_on_nodes[rank] += ops->get_size(data);
			
 
				+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
			
 
				+			int ret = _starpu_mpi_find_executee_node(data, mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
			
 
				+			if (ret == -EINVAL)
			
 
				+			{
			
 
				+				free(size_on_nodes);
			
 
				+				return ret;
			
 
				 			}
			
 
				-
			
 
				-			if (arg_type & STARPU_W) {
			
 
				-				if (!data) {
			
 
				-					/* We don't have anything allocated for this.
			
 
				-					 * The application knows we won't do anything
			
 
				-					 * about this task */
			
 
				-					/* Yes, the app could actually not call
			
 
				-					 * insert_task at all itself, this is just a
			
 
				-					 * safeguard. */
			
 
				-					_STARPU_MPI_DEBUG("oh oh\n");
			
 
				-					_STARPU_MPI_LOG_OUT();
			
 
				+			current_data ++;
			
 
				+		}
			
 
				+		else if (arg_type == STARPU_DATA_ARRAY)
			
 
				+		{
			
 
				+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
			
 
				+			int nb_handles = va_arg(varg_list, int);
			
 
				+			int i;
			
 
				+			for(i=0 ; i<nb_handles ; i++)
			
 
				+			{
			
 
				+				enum starpu_access_mode mode = codelet->modes[current_data];
			
 
				+				int ret = _starpu_mpi_find_executee_node(datas[i], mode, me, &do_execute, &inconsistent_execute, &dest, size_on_nodes);
			
 
				+				if (ret == -EINVAL)
			
 
				+				{
			
 
				 					free(size_on_nodes);
			
 
				-					return -EINVAL;
			
 
				-				}
			
 
				-				int mpi_rank = starpu_data_get_rank(data);
			
 
				-				if (mpi_rank == me) {
			
 
				-					if (do_execute == 0) {
			
 
				-						inconsistent_execute = 1;
			
 
				-					}
			
 
				-					else {
			
 
				-						do_execute = 1;
			
 
				-					}
			
 
				-				}
			
 
				-				else if (mpi_rank != -1) {
			
 
				-					if (do_execute == 1) {
			
 
				-						inconsistent_execute = 1;
			
 
				-					}
			
 
				-					else {
			
 
				-						do_execute = 0;
			
 
				-						dest = mpi_rank;
			
 
				-						/* That's the rank which needs the data to be sent to */
			
 
				-					}
			
 
				-				}
			
 
				-				else {
			
 
				-					_STARPU_ERROR("rank invalid\n");
			
 
				+					return ret;
			
 
				 				}
			
 
				+				current_data ++;
			
 
				 			}
			
 
				 		}
			
 
				 		else if (arg_type==STARPU_VALUE) {
			
@@ -210,49 +355,26 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 
			
 
				 	/* Send and receive data as requested */
			
 
				 	va_start(varg_list, codelet);
			
 
				+	current_data = 0;
			
 
				 	while ((arg_type = va_arg(varg_list, int)) != 0) {
			
 
				-		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
			
 
				+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
			
 
				 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
			
 
				-			if (data && arg_type & STARPU_R) {
			
 
				-				int mpi_rank = starpu_data_get_rank(data);
			
 
				-				int mpi_tag = starpu_data_get_tag(data);
			
 
				-				STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
			
 
				-				/* The task needs to read this data */
			
 
				-				if (do_execute && mpi_rank != me && mpi_rank != -1) {
			
 
				-					/* I will have to execute but I don't have the data, receive */
			
 
				-#ifdef MPI_CACHE
			
 
				-					void *already_received = _starpu_htbl_search_64(received_data[mpi_rank], (uintptr_t) data);
			
 
				-					if (!already_received) {
			
 
				-						_starpu_htbl_insert_64(&received_data[mpi_rank], (uintptr_t) data, data);
			
 
				-					}
			
 
				-					else {
			
 
				-						_STARPU_MPI_DEBUG("Do not receive data %p from node %d as it is already available\n", data, mpi_rank);
			
 
				-					}
			
 
				-					if (!already_received)
			
 
				-#endif
			
 
				-					{
			
 
				-						_STARPU_MPI_DEBUG("Receive data %p from %d\n", data, mpi_rank);
			
 
				-						starpu_mpi_irecv_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
			
 
				-					}
			
 
				-				}
			
 
				-				if (!do_execute && mpi_rank == me) {
			
 
				-					/* Somebody else will execute it, and I have the data, send it. */
			
 
				-#ifdef MPI_CACHE
			
 
				-					void *already_sent = _starpu_htbl_search_64(sent_data[dest], (uintptr_t) data);
			
 
				-
			
 
				-					if (!already_sent) {
			
 
				-						_starpu_htbl_insert_64(&sent_data[dest], (uintptr_t) data, data);
			
 
				-					}
			
 
				-					else {
			
 
				-						_STARPU_MPI_DEBUG("Do not send data %p to node %d as it has already been sent\n", data, dest);
			
 
				-					}
			
 
				-					if (!already_sent)
			
 
				-#endif
			
 
				-					{
			
 
				-						_STARPU_MPI_DEBUG("Send data %p to %d\n", data, dest);
			
 
				-						starpu_mpi_isend_detached(data, dest, mpi_tag, comm, NULL, NULL);
			
 
				-					}
			
 
				-				}
			
 
				+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
			
 
				+
			
 
				+			_starpu_mpi_exchange_data_before_execution(data, mode, me, dest, do_execute, comm);
			
 
				+			current_data ++;
			
 
				+
			
 
				+		}
			
 
				+		else if (arg_type == STARPU_DATA_ARRAY)
			
 
				+		{
			
 
				+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
			
 
				+			int nb_handles = va_arg(varg_list, int);
			
 
				+			int i;
			
 
				+
			
 
				+			for(i=0 ; i<nb_handles ; i++)
			
 
				+			{
			
 
				+				_starpu_mpi_exchange_data_before_execution(datas[i], codelet->modes[current_data], me, dest, do_execute, comm);
			
 
				+				current_data++;
			
 
				 			}
			
 
				 		}
			
 
				 		else if (arg_type==STARPU_VALUE) {
			
@@ -285,30 +407,32 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 		_STARPU_MPI_DEBUG("Execution of the codelet %p (%s)\n", codelet, codelet->name);
			
 
				 		va_start(varg_list, codelet);
			
 
				 		struct starpu_task *task = starpu_task_create();
			
 
				-		int ret = _starpu_insert_task_create_and_submit(arg_buffer, codelet, &task, varg_list);
			
 
				+		int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, codelet, &task, varg_list);
			
 
				 		_STARPU_MPI_DEBUG("ret: %d\n", ret);
			
 
				 		STARPU_ASSERT(ret==0);
			
 
				 	}
			
 
				 
			
 
				 	if (inconsistent_execute) {
			
 
				 		va_start(varg_list, codelet);
			
 
				+		current_data = 0;
			
 
				 		while ((arg_type = va_arg(varg_list, int)) != 0) {
			
 
				-			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
			
 
				+			if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type==STARPU_SCRATCH || arg_type==STARPU_REDUX) {
			
 
				 				starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
			
 
				-				if (arg_type & STARPU_W) {
			
 
				-					int mpi_rank = starpu_data_get_rank(data);
			
 
				-					int mpi_tag = starpu_data_get_tag(data);
			
 
				-					STARPU_ASSERT(mpi_tag >= 0 && "StarPU needs to be told the MPI rank of this data, using starpu_data_set_rank");
			
 
				-					if (mpi_rank == me) {
			
 
				-						if (xrank != -1 && me != xrank) {
			
 
				-							_STARPU_MPI_DEBUG("Receive data %p back from the task %d which executed the codelet ...\n", data, dest);
			
 
				-							starpu_mpi_irecv_detached(data, dest, mpi_tag, comm, NULL, NULL);
			
 
				-						}
			
 
				-					}
			
 
				-					else if (do_execute) {
			
 
				-						_STARPU_MPI_DEBUG("Send data %p back to its owner %d...\n", data, mpi_rank);
			
 
				-						starpu_mpi_isend_detached(data, mpi_rank, mpi_tag, comm, NULL, NULL);
			
 
				-					}
			
 
				+				enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
			
 
				+
			
 
				+				_starpu_mpi_exchange_data_after_execution(data, mode, me, xrank, dest, do_execute, comm);
			
 
				+				current_data++;
			
 
				+			}
			
 
				+			else if (arg_type == STARPU_DATA_ARRAY)
			
 
				+			{
			
 
				+				starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
			
 
				+				int nb_handles = va_arg(varg_list, int);
			
 
				+				int i;
			
 
				+
			
 
				+				for(i=0 ; i<nb_handles ; i++)
			
 
				+				{
			
 
				+					_starpu_mpi_exchange_data_after_execution(datas[i], codelet->modes[current_data], me, xrank, dest, do_execute, comm);
			
 
				+					current_data++;
			
 
				 				}
			
 
				 			}
			
 
				 			else if (arg_type==STARPU_VALUE) {
			
@@ -339,45 +463,26 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 	}
			
 
				 
			
 
				 	va_start(varg_list, codelet);
			
 
				+	current_data = 0;
			
 
				 	while ((arg_type = va_arg(varg_list, int)) != 0) {
			
 
				-		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH) {
			
 
				+		if (arg_type==STARPU_R || arg_type==STARPU_W || arg_type==STARPU_RW || arg_type == STARPU_SCRATCH || arg_type == STARPU_REDUX) {
			
 
				 			starpu_data_handle_t data = va_arg(varg_list, starpu_data_handle_t);
			
 
				-#ifdef MPI_CACHE
			
 
				-			if (arg_type & STARPU_W) {
			
 
				-				if (do_execute) {
			
 
				-					/* Note that all copies I've sent to neighbours are now invalid */
			
 
				-					int n, size;
			
 
				-					MPI_Comm_size(comm, &size);
			
 
				-					for(n=0 ; n<size ; n++) {
			
 
				-						void *already_sent = _starpu_htbl_search_64(sent_data[n], (uintptr_t) data);
			
 
				-
			
 
				-						if (already_sent) {
			
 
				-							_STARPU_MPI_DEBUG("Posting request to clear send cache for data %p\n", data);
			
 
				-							_starpu_mpi_clear_cache_request(data, n, _STARPU_MPI_CLEAR_SENT_DATA);
			
 
				-						}
			
 
				-					}
			
 
				-				}
			
 
				-				else {
			
 
				-					int mpi_rank = starpu_data_get_rank(data);
			
 
				-					void *already_received=_starpu_htbl_search_64(received_data[mpi_rank], (uintptr_t) data);
			
 
				-					if (already_received) {
			
 
				-						/* Somebody else will write to the data, so discard our cached copy if any */
			
 
				-						/* TODO: starpu_mpi could just remember itself. */
			
 
				-						_STARPU_MPI_DEBUG("Posting request to clear receive cache for data %p\n", data);
			
 
				-						_starpu_mpi_clear_cache_request(data, mpi_rank, _STARPU_MPI_CLEAR_RECEIVED_DATA);
			
 
				-						_starpu_data_deallocate(data);
			
 
				-					}
			
 
				-				}
			
 
				-			}
			
 
				-#else
			
 
				-			/* We allocated a temporary buffer for the received data, now drop it */
			
 
				-			if ((arg_type & STARPU_R) && do_execute) {
			
 
				-				int mpi_rank = starpu_data_get_rank(data);
			
 
				-				if (mpi_rank != me && mpi_rank != -1) {
			
 
				-					_starpu_data_deallocate(data);
			
 
				-				}
			
 
				+			enum starpu_access_mode mode = (enum starpu_access_mode) arg_type;
			
 
				+
			
 
				+			_starpu_mpi_clear_data_after_execution(data, mode, me, do_execute, comm);
			
 
				+			current_data++;
			
 
				+		}
			
 
				+		else if (arg_type == STARPU_DATA_ARRAY)
			
 
				+		{
			
 
				+			starpu_data_handle_t *datas = va_arg(varg_list, starpu_data_handle_t *);
			
 
				+			int nb_handles = va_arg(varg_list, int);
			
 
				+			int i;
			
 
				+
			
 
				+			for(i=0 ; i<nb_handles ; i++)
			
 
				+			{
			
 
				+				_starpu_mpi_clear_data_after_execution(datas[i], codelet->modes[current_data], me, do_execute, comm);
			
 
				+				current_data++;
			
 
				 			}
			
 
				-#endif
			
 
				 		}
			
 
				 		else if (arg_type==STARPU_VALUE) {
			
 
				 			va_arg(varg_list, void *);
			
--- a/mpi/starpu_mpi_insert_task_cache.c
+++ b/mpi/starpu_mpi_insert_task_cache.c
@@ -1,93 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2011-2012  Université de Bordeaux 1
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu_mpi_private.h>
			
 
				-#include <starpu_mpi_insert_task_cache.h>
			
 
				-#include <common/htable64.h>
			
 
				-
			
 
				-typedef struct _starpu_mpi_clear_cache_s {
			
 
				-        starpu_data_handle_t data;
			
 
				-        int rank;
			
 
				-        int mode;
			
 
				-} _starpu_mpi_clear_cache_t;
			
 
				-
			
 
				-struct starpu_htbl64_node **sent_data = NULL;
			
 
				-struct starpu_htbl64_node **received_data = NULL;
			
 
				-
			
 
				-void _starpu_mpi_clear_cache_callback(void *callback_arg)
			
 
				-{
			
 
				-        _starpu_mpi_clear_cache_t *clear_cache = (_starpu_mpi_clear_cache_t *)callback_arg;
			
 
				-
			
 
				-        if (clear_cache->mode == _STARPU_MPI_CLEAR_SENT_DATA) {
			
 
				-                _STARPU_MPI_DEBUG("Clearing sent cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
			
 
				-                _starpu_htbl_insert_64(&sent_data[clear_cache->rank], (uintptr_t)clear_cache->data, NULL);
			
 
				-        }
			
 
				-        else if (clear_cache->mode == _STARPU_MPI_CLEAR_RECEIVED_DATA) {
			
 
				-                _STARPU_MPI_DEBUG("Clearing received cache for data %p and rank %d\n", clear_cache->data, clear_cache->rank);
			
 
				-                _starpu_htbl_insert_64(&received_data[clear_cache->rank], (uintptr_t)clear_cache->data, NULL);
			
 
				-        }
			
 
				-
			
 
				-        free(clear_cache);
			
 
				-}
			
 
				-
			
 
				-double _starpu_mpi_clear_cache_cost_function(struct starpu_task *task, unsigned nimpl)
			
 
				-{
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static struct starpu_perfmodel _starpu_mpi_clear_cache_model =
			
 
				-{
			
 
				-	.cost_function = _starpu_mpi_clear_cache_cost_function,
			
 
				-	.type = STARPU_COMMON,
			
 
				-};
			
 
				-
			
 
				-static void _starpu_mpi_clear_cache_func(void *descr[] __attribute__ ((unused)), void *arg __attribute__ ((unused)))
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-static struct starpu_codelet _starpu_mpi_clear_cache_codelet =
			
 
				-{
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				-	.cpu_funcs = {_starpu_mpi_clear_cache_func, NULL},
			
 
				-	.cuda_funcs = {_starpu_mpi_clear_cache_func, NULL},
			
 
				-	.opencl_funcs = {_starpu_mpi_clear_cache_func, NULL},
			
 
				-	.nbuffers = 1,
			
 
				-	.modes = {STARPU_RW},
			
 
				-	.model = &_starpu_mpi_clear_cache_model
			
 
				-	// The model has a cost function which returns 0 so as to allow the codelet to be scheduled anywhere
			
 
				-};
			
 
				-
			
 
				-void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode)
			
 
				-{
			
 
				-        struct starpu_task *task = starpu_task_create();
			
 
				-
			
 
				-	// We have a codelet with a empty function just to force the
			
 
				-	// task being created to have a dependency on data_handle
			
 
				-        task->cl = &_starpu_mpi_clear_cache_codelet;
			
 
				-        task->handles[0] = data_handle;
			
 
				-
			
 
				-        _starpu_mpi_clear_cache_t *clear_cache = malloc(sizeof(_starpu_mpi_clear_cache_t));
			
 
				-        clear_cache->data = data_handle;
			
 
				-        clear_cache->rank = rank;
			
 
				-        clear_cache->mode = mode;
			
 
				-
			
 
				-        task->callback_func = _starpu_mpi_clear_cache_callback;
			
 
				-        task->callback_arg = clear_cache;
			
 
				-        int ret = starpu_task_submit(task);
			
 
				-        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				-}
			
 
				-
			
--- a/mpi/starpu_mpi_stats.c
+++ b/mpi/starpu_mpi_stats.c
@@ -22,39 +22,30 @@
 
				 
			
 
				 /* measure the amount of data transfers between each pair of MPI nodes */
			
 
				 #ifdef STARPU_COMM_STATS
			
 
				-static size_t **comm_amount;
			
 
				+static size_t *comm_amount;
			
 
				 static int world_size;
			
 
				 #endif /* STARPU_COMM_STATS */
			
 
				 
			
 
				-void _starpu_mpi_comm_amounts_init()
			
 
				+void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
			
 
				 {
			
 
				 #ifdef STARPU_COMM_STATS
			
 
				-	int i;
			
 
				+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: StarPU was configured with --enable-comm-stats, which slows down a bit\n");
			
 
				 
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
			
 
				+	MPI_Comm_size(comm, &world_size);
			
 
				 	_STARPU_MPI_DEBUG("allocating for %d nodes\n", world_size);
			
 
				 
			
 
				-	comm_amount = (size_t **) calloc(1, world_size * sizeof(size_t *));
			
 
				-	for(i=0 ; i<world_size ; i++)
			
 
				-	{
			
 
				-		comm_amount[i] = (size_t *) calloc(1, world_size * sizeof(size_t));
			
 
				-	}
			
 
				+	comm_amount = (size_t *) calloc(world_size, sizeof(size_t));
			
 
				 #endif /* STARPU_COMM_STATS */
			
 
				 }
			
 
				 
			
 
				 void _starpu_mpi_comm_amounts_free()
			
 
				 {
			
 
				 #ifdef STARPU_COMM_STATS
			
 
				-	int i;
			
 
				-	for(i=0 ; i<world_size ; i++)
			
 
				-	{
			
 
				-		free(comm_amount[i]);
			
 
				-	}
			
 
				 	free(comm_amount);
			
 
				 #endif /* STARPU_COMM_STATS */
			
 
				 }
			
 
				 
			
 
				-void _starpu_mpi_comm_amounts_inc(MPI_Comm comm  __attribute__ ((unused)), 
			
 
				+void _starpu_mpi_comm_amounts_inc(MPI_Comm comm  __attribute__ ((unused)),
			
 
				 				  unsigned dst  __attribute__ ((unused)), MPI_Datatype datatype  __attribute__ ((unused)))
			
 
				 {
			
 
				 #ifdef STARPU_COMM_STATS
			
@@ -63,43 +54,33 @@ void _starpu_mpi_comm_amounts_inc(MPI_Comm comm  __attribute__ ((unused)),
 
				 	MPI_Comm_rank(comm, &src);
			
 
				 	MPI_Type_size(datatype, &size);
			
 
				 
			
 
				-	_STARPU_MPI_DEBUG("adding %d from %d to %d\n", size, src, dst);
			
 
				+	_STARPU_MPI_DEBUG("[%d] adding %d to %d\n", src, size, dst);
			
 
				 
			
 
				-	comm_amount[src][dst] += size;
			
 
				+	comm_amount[dst] += size;
			
 
				 #endif /* STARPU_COMM_STATS */
			
 
				 }
			
 
				 
			
 
				-void _starpu_mpi_comm_amounts_display()
			
 
				+void _starpu_mpi_comm_amounts_display(int node)
			
 
				 {
			
 
				 #ifdef STARPU_COMM_STATS
			
 
				-	unsigned src, dst;
			
 
				-
			
 
				+	unsigned dst;
			
 
				 	size_t sum = 0;
			
 
				 
			
 
				 	for (dst = 0; dst < world_size; dst++)
			
 
				-		for (src = 0; src < world_size; src++)
			
 
				-		{
			
 
				-			sum += comm_amount[src][dst];
			
 
				-		}
			
 
				+	{
			
 
				+		sum += comm_amount[dst];
			
 
				+	}
			
 
				 
			
 
				-	fprintf(stderr, "\nCommunication transfers stats:\nTOTAL transfers %f B\t%f MB\n", (float)sum, (float)sum/1024/1024);
			
 
				+	fprintf(stderr, "\n[%d] Communication transfers stats:\nTOTAL transfers %f B\t%f MB\n", node, (float)sum, (float)sum/1024/1024);
			
 
				 
			
 
				 	for (dst = 0; dst < world_size; dst++)
			
 
				-		for (src = 0; src < world_size; src++)
			
 
				+	{
			
 
				+		if (comm_amount[dst])
			
 
				 		{
			
 
				-			if (comm_amount[src][dst])
			
 
				-			{
			
 
				-				fprintf(stderr, "\t%d <-> %d\t%f B\t%f MB\n",
			
 
				-					src, dst, (float)comm_amount[src][dst] + (float)comm_amount[dst][src],
			
 
				-					((float)comm_amount[src][dst] + (float)comm_amount[dst][src])/(1024*1024));
			
 
				-				fprintf(stderr, "\t\t%d -> %d\t%f B\t%f MB\n",
			
 
				-					src, dst, (float)comm_amount[src][dst],
			
 
				-					((float)comm_amount[src][dst])/(1024*1024));
			
 
				-				fprintf(stderr, "\t\t%d -> %d\t%f B\t%f MB\n",
			
 
				-					dst, src, (float)comm_amount[dst][src],
			
 
				-					((float)comm_amount[dst][src])/(1024*1024));
			
 
				-			}
			
 
				+			fprintf(stderr, "\t%d -> %d\t%f B\t%f MB\n",
			
 
				+				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024));
			
 
				 		}
			
 
				+	}
			
 
				 #endif /* STARPU_COMM_STATS */
			
 
				 }
			
 
				 
			
--- a/mpi/starpu_mpi_stats.h
+++ b/mpi/starpu_mpi_stats.h
@@ -17,8 +17,8 @@
 
				 #include <stdlib.h>
			
 
				 #include <mpi.h>
			
 
				 
			
 
				-void _starpu_mpi_comm_amounts_init();
			
 
				+void _starpu_mpi_comm_amounts_init(MPI_Comm comm);
			
 
				 void _starpu_mpi_comm_amounts_free();
			
 
				 void _starpu_mpi_comm_amounts_inc(MPI_Comm comm, unsigned dst, MPI_Datatype datatype);
			
 
				-void _starpu_mpi_comm_amounts_display();
			
 
				+void _starpu_mpi_comm_amounts_display(int node);
			
 
				 
			
--- a/mpi/tests/insert_task_cache.c
+++ b/mpi/tests/insert_task_cache.c
@@ -106,12 +106,19 @@ int main(int argc, char **argv)
 
				                 }
			
 
				         }
			
 
				 
			
 
				+	mycodelet.name = "codelet1";
			
 
				         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[1][1], STARPU_R, data_handles[0][1], 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				+
			
 
				+	mycodelet.name = "codelet2";
			
 
				         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				+
			
 
				+	mycodelet.name = "codelet3";
			
 
				         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[0][1], STARPU_R, data_handles[0][0], 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				+
			
 
				+	mycodelet.name = "codelet4";
			
 
				         ret = starpu_mpi_insert_task(MPI_COMM_WORLD, &mycodelet, STARPU_RW, data_handles[3][1], STARPU_R, data_handles[0][1], 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_insert_task");
			
 
				 
			
--- a/socl/examples/basic/basic.c
+++ b/socl/examples/basic/basic.c
@@ -86,8 +86,10 @@ int main(int UNUSED(argc), char** UNUSED(argv)) {
 
				       if (num_devices != 0)
			
 
				          break;
			
 
				    }
			
 
				-   if (num_devices == 0)
			
 
				-      error("No OpenCL device found\n");
			
 
				+   if (num_devices == 0) {
			
 
				+      printf("No OpenCL device found\n");
			
 
				+      exit(77);
			
 
				+   }
			
 
				 
			
 
				    printf("Creating context...\n");
			
 
				    cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platform_idx], 0};
			
--- a/socl/examples/clinfo/clinfo.c
+++ b/socl/examples/clinfo/clinfo.c
@@ -46,7 +46,6 @@ main(void) {
 
				    err = clGetPlatformIDs(num_platforms, platforms, NULL);
			
 
				    checkErr(err, "Unable to get platform list");
			
 
				 
			
 
				-
			
 
				    // Iteratate over platforms
			
 
				    printf("Number of platforms:\t\t\t\t %d\n", num_platforms);
			
 
				 
			
@@ -92,6 +91,11 @@ main(void) {
 
				 
			
 
				          err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
			
 
				          checkErr(err, "clGetDeviceIds(CL_DEVICE_TYPE_ALL)");
			
 
				+	 if (num_devices == 0) {
			
 
				+	      printf("  No devices found\n");
			
 
				+	      continue;
			
 
				+	 }
			
 
				+
			
 
				          devices = (cl_device_id*)malloc(sizeof(cl_device_id)*num_devices);
			
 
				 
			
 
				          err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
			
--- a/socl/examples/matmul/matmul.c
+++ b/socl/examples/matmul/matmul.c
@@ -240,6 +240,10 @@ int main(int argc, const char** argv) {
 
				 			devs[p] = 0;

			
 
				 			continue;

			
 
				 		}

			
 
				+		if (devs[p] == 0) {

			
 
				+		     printf("No OpenCL device found\n");

			
 
				+		     exit(77);

			
 
				+		}

			
 
				 		check(err);

			
 
				 		if (devs[p] == 0)

			
 
				 			continue;

			
--- a/socl/src/CL/cl.h
+++ b/socl/src/CL/cl.h
--- a/socl/src/CL/cl_d3d10.h
+++ b/socl/src/CL/cl_d3d10.h
@@ -0,0 +1,126 @@
 
				+/**********************************************************************************
			
 
				+ * Copyright (c) 2008-2010 The Khronos Group Inc.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a
			
 
				+ * copy of this software and/or associated documentation files (the
			
 
				+ * "Materials"), to deal in the Materials without restriction, including
			
 
				+ * without limitation the rights to use, copy, modify, merge, publish,
			
 
				+ * distribute, sublicense, and/or sell copies of the Materials, and to
			
 
				+ * permit persons to whom the Materials are furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included
			
 
				+ * in all copies or substantial portions of the Materials.
			
 
				+ *
			
 
				+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
			
 
				+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
			
 
				+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
			
 
				+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
			
 
				+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
			
 
				+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
			
 
				+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
			
 
				+ **********************************************************************************/
			
 
				+
			
 
				+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
			
 
				+
			
 
				+#ifndef __OPENCL_CL_D3D10_H
			
 
				+#define __OPENCL_CL_D3D10_H
			
 
				+
			
 
				+#include <d3d10.h>
			
 
				+#include <CL/cl.h>
			
 
				+#include <CL/cl_platform.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+/******************************************************************************
			
 
				+ * cl_khr_d3d10_sharing                                                       */
			
 
				+#define cl_khr_d3d10_sharing 1
			
 
				+
			
 
				+typedef cl_uint cl_d3d10_device_source_khr;
			
 
				+typedef cl_uint cl_d3d10_device_set_khr;
			
 
				+
			
 
				+/******************************************************************************/
			
 
				+
			
 
				+// Error Codes
			
 
				+#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
			
 
				+#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
			
 
				+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
			
 
				+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
			
 
				+
			
 
				+// cl_d3d10_device_source_nv
			
 
				+#define CL_D3D10_DEVICE_KHR                          0x4010
			
 
				+#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
			
 
				+
			
 
				+// cl_d3d10_device_set_nv
			
 
				+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
			
 
				+#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
			
 
				+
			
 
				+// cl_context_info
			
 
				+#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
			
 
				+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
			
 
				+
			
 
				+// cl_mem_info
			
 
				+#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
			
 
				+
			
 
				+// cl_image_info
			
 
				+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
			
 
				+
			
 
				+// cl_command_type
			
 
				+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
			
 
				+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
			
 
				+
			
 
				+/******************************************************************************/
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
			
 
				+    cl_platform_id             platform,
			
 
				+    cl_d3d10_device_source_khr d3d_device_source,
			
 
				+    void *                     d3d_object,
			
 
				+    cl_d3d10_device_set_khr    d3d_device_set,
			
 
				+    cl_uint                    num_entries,
			
 
				+    cl_device_id *             devices,
			
 
				+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
			
 
				+    cl_context     context,
			
 
				+    cl_mem_flags   flags,
			
 
				+    ID3D10Buffer * resource,
			
 
				+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
			
 
				+    cl_context        context,
			
 
				+    cl_mem_flags      flags,
			
 
				+    ID3D10Texture2D * resource,
			
 
				+    UINT              subresource,
			
 
				+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
			
 
				+    cl_context        context,
			
 
				+    cl_mem_flags      flags,
			
 
				+    ID3D10Texture3D * resource,
			
 
				+    UINT              subresource,
			
 
				+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
			
 
				+    cl_command_queue command_queue,
			
 
				+    cl_uint          num_objects,
			
 
				+    const cl_mem *   mem_objects,
			
 
				+    cl_uint          num_events_in_wait_list,
			
 
				+    const cl_event * event_wait_list,
			
 
				+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
			
 
				+    cl_command_queue command_queue,
			
 
				+    cl_uint          num_objects,
			
 
				+    const cl_mem *   mem_objects,
			
 
				+    cl_uint          num_events_in_wait_list,
			
 
				+    const cl_event * event_wait_list,
			
 
				+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif  // __OPENCL_CL_D3D10_H
			
 
				+
			
--- a/socl/src/CL/cl_d3d11.h
+++ b/socl/src/CL/cl_d3d11.h
@@ -0,0 +1,126 @@
 
				+/**********************************************************************************
			
 
				+ * Copyright (c) 2008-2010 The Khronos Group Inc.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a
			
 
				+ * copy of this software and/or associated documentation files (the
			
 
				+ * "Materials"), to deal in the Materials without restriction, including
			
 
				+ * without limitation the rights to use, copy, modify, merge, publish,
			
 
				+ * distribute, sublicense, and/or sell copies of the Materials, and to
			
 
				+ * permit persons to whom the Materials are furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included
			
 
				+ * in all copies or substantial portions of the Materials.
			
 
				+ *
			
 
				+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
			
 
				+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
			
 
				+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
			
 
				+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
			
 
				+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
			
 
				+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
			
 
				+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
			
 
				+ **********************************************************************************/
			
 
				+
			
 
				+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
			
 
				+
			
 
				+#ifndef __OPENCL_CL_D3D11_H
			
 
				+#define __OPENCL_CL_D3D11_H
			
 
				+
			
 
				+#include <d3d11.h>
			
 
				+#include <CL/cl.h>
			
 
				+#include <CL/cl_platform.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+/******************************************************************************
			
 
				+ * cl_khr_d3d11_sharing                                                       */
			
 
				+#define cl_khr_d3d11_sharing 1
			
 
				+
			
 
				+typedef cl_uint cl_d3d11_device_source_khr;
			
 
				+typedef cl_uint cl_d3d11_device_set_khr;
			
 
				+
			
 
				+/******************************************************************************/
			
 
				+
			
 
				+// Error Codes
			
 
				+#define CL_INVALID_D3D11_DEVICE_KHR                  -1006
			
 
				+#define CL_INVALID_D3D11_RESOURCE_KHR                -1007
			
 
				+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
			
 
				+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
			
 
				+
			
 
				+// cl_d3d11_device_source
			
 
				+#define CL_D3D11_DEVICE_KHR                          0x4019
			
 
				+#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
			
 
				+
			
 
				+// cl_d3d11_device_set
			
 
				+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
			
 
				+#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
			
 
				+
			
 
				+// cl_context_info
			
 
				+#define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
			
 
				+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
			
 
				+
			
 
				+// cl_mem_info
			
 
				+#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
			
 
				+
			
 
				+// cl_image_info
			
 
				+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
			
 
				+
			
 
				+// cl_command_type
			
 
				+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
			
 
				+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
			
 
				+
			
 
				+/******************************************************************************/
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
			
 
				+    cl_platform_id             platform,
			
 
				+    cl_d3d11_device_source_khr d3d_device_source,
			
 
				+    void *                     d3d_object,
			
 
				+    cl_d3d11_device_set_khr    d3d_device_set,
			
 
				+    cl_uint                    num_entries,
			
 
				+    cl_device_id *             devices,
			
 
				+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
			
 
				+    cl_context     context,
			
 
				+    cl_mem_flags   flags,
			
 
				+    ID3D11Buffer * resource,
			
 
				+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
			
 
				+    cl_context        context,
			
 
				+    cl_mem_flags      flags,
			
 
				+    ID3D11Texture2D * resource,
			
 
				+    UINT              subresource,
			
 
				+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
			
 
				+    cl_context        context,
			
 
				+    cl_mem_flags      flags,
			
 
				+    ID3D11Texture3D * resource,
			
 
				+    UINT              subresource,
			
 
				+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
			
 
				+    cl_command_queue command_queue,
			
 
				+    cl_uint          num_objects,
			
 
				+    const cl_mem *   mem_objects,
			
 
				+    cl_uint          num_events_in_wait_list,
			
 
				+    const cl_event * event_wait_list,
			
 
				+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
			
 
				+    cl_command_queue command_queue,
			
 
				+    cl_uint          num_objects,
			
 
				+    const cl_mem *   mem_objects,
			
 
				+    cl_uint          num_events_in_wait_list,
			
 
				+    const cl_event * event_wait_list,
			
 
				+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif  // __OPENCL_CL_D3D11_H
			
 
				+
			
--- a/socl/src/CL/cl_dx9_media_sharing.h
+++ b/socl/src/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,127 @@
 
				+/**********************************************************************************
			
 
				+ * Copyright (c) 2008-2012 The Khronos Group Inc.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a
			
 
				+ * copy of this software and/or associated documentation files (the
			
 
				+ * "Materials"), to deal in the Materials without restriction, including
			
 
				+ * without limitation the rights to use, copy, modify, merge, publish,
			
 
				+ * distribute, sublicense, and/or sell copies of the Materials, and to
			
 
				+ * permit persons to whom the Materials are furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included
			
 
				+ * in all copies or substantial portions of the Materials.
			
 
				+ *
			
 
				+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
			
 
				+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
			
 
				+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
			
 
				+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
			
 
				+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
			
 
				+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
			
 
				+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
			
 
				+ **********************************************************************************/
			
 
				+
			
 
				+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
			
 
				+
			
 
				+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
			
 
				+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
			
 
				+
			
 
				+#include <CL/cl.h>
			
 
				+#include <CL/cl_platform.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+/******************************************************************************
			
 
				+/* cl_khr_dx9_media_sharing                                                   */
			
 
				+#define cl_khr_dx9_media_sharing 1
			
 
				+
			
 
				+typedef cl_uint             cl_dx9_media_adapter_type_khr;
			
 
				+typedef cl_uint             cl_dx9_media_adapter_set_khr;
			
 
				+    
			
 
				+#if defined(_WIN32)
			
 
				+#include <d3d9.h>
			
 
				+typedef struct _cl_dx9_surface_info_khr
			
 
				+{
			
 
				+    IDirect3DSurface9 *resource;
			
 
				+    HANDLE shared_handle;
			
 
				+} cl_dx9_surface_info_khr;
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+/******************************************************************************/
			
 
				+
			
 
				+// Error Codes
			
 
				+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
			
 
				+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
			
 
				+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
			
 
				+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
			
 
				+
			
 
				+// cl_media_adapter_type_khr
			
 
				+#define CL_ADAPTER_D3D9_KHR                              0x2020
			
 
				+#define CL_ADAPTER_D3D9EX_KHR                            0x2021
			
 
				+#define CL_ADAPTER_DXVA_KHR                              0x2022
			
 
				+
			
 
				+// cl_media_adapter_set_khr
			
 
				+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
			
 
				+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
			
 
				+
			
 
				+// cl_context_info
			
 
				+#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
			
 
				+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
			
 
				+#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
			
 
				+
			
 
				+// cl_mem_info
			
 
				+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
			
 
				+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
			
 
				+
			
 
				+// cl_image_info
			
 
				+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
			
 
				+
			
 
				+// cl_command_type
			
 
				+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
			
 
				+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
			
 
				+
			
 
				+/******************************************************************************/
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
			
 
				+    cl_platform_id                   platform,
			
 
				+    cl_uint                          num_media_adapters,
			
 
				+    cl_dx9_media_adapter_type_khr *  media_adapter_type,
			
 
				+    void *                           media_adapters,
			
 
				+    cl_dx9_media_adapter_set_khr     media_adapter_set,
			
 
				+    cl_uint                          num_entries,
			
 
				+    cl_device_id *                   devices,
			
 
				+    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
			
 
				+    cl_context                    context,
			
 
				+    cl_mem_flags                  flags,
			
 
				+    cl_dx9_media_adapter_type_khr adapter_type,
			
 
				+    void *                        surface_info,
			
 
				+    cl_uint                       plane,                                                                          
			
 
				+    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
			
 
				+    cl_command_queue command_queue,
			
 
				+    cl_uint          num_objects,
			
 
				+    const cl_mem *   mem_objects,
			
 
				+    cl_uint          num_events_in_wait_list,
			
 
				+    const cl_event * event_wait_list,
			
 
				+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
			
 
				+    cl_command_queue command_queue,
			
 
				+    cl_uint          num_objects,
			
 
				+    cl_mem *         mem_objects,
			
 
				+    cl_uint          num_events_in_wait_list,
			
 
				+    const cl_event * event_wait_list,
			
 
				+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif  // __OPENCL_CL_DX9_MEDIA_SHARING_H
			
 
				+
			
--- a/socl/src/CL/cl_ext.h
+++ b/socl/src/CL/cl_ext.h
@@ -0,0 +1,213 @@
 
				+/*******************************************************************************
			
 
				+ * Copyright (c) 2008-2010 The Khronos Group Inc.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a
			
 
				+ * copy of this software and/or associated documentation files (the
			
 
				+ * "Materials"), to deal in the Materials without restriction, including
			
 
				+ * without limitation the rights to use, copy, modify, merge, publish,
			
 
				+ * distribute, sublicense, and/or sell copies of the Materials, and to
			
 
				+ * permit persons to whom the Materials are furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included
			
 
				+ * in all copies or substantial portions of the Materials.
			
 
				+ *
			
 
				+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
			
 
				+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
			
 
				+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
			
 
				+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
			
 
				+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
			
 
				+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
			
 
				+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
			
 
				+ ******************************************************************************/
			
 
				+
			
 
				+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
			
 
				+
			
 
				+/* cl_ext.h contains OpenCL extensions which don't have external */
			
 
				+/* (OpenGL, D3D) dependencies.                                   */
			
 
				+
			
 
				+#ifndef __CL_EXT_H
			
 
				+#define __CL_EXT_H
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+#ifdef __APPLE__
			
 
				+	#include <OpenCL/cl.h>
			
 
				+    #include <AvailabilityMacros.h>
			
 
				+#else
			
 
				+	#include <CL/cl.h>
			
 
				+#endif
			
 
				+
			
 
				+/* cl_khr_fp64 extension - no extension #define since it has no functions  */
			
 
				+#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
			
 
				+
			
 
				+/* cl_khr_fp16 extension - no extension #define since it has no functions  */
			
 
				+#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
			
 
				+
			
 
				+/* Memory object destruction
			
 
				+ *
			
 
				+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
			
 
				+ *
			
 
				+ * Registers a user callback function that will be called when the memory object is deleted and its resources 
			
 
				+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback 
			
 
				+ * stack associated with memobj. The registered user callback functions are called in the reverse order in 
			
 
				+ * which they were registered. The user callback functions are called and then the memory object is deleted 
			
 
				+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be 
			
 
				+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as 
			
 
				+ * the storage bits for the memory object, can be reused or freed.
			
 
				+ *
			
 
				+ * The application may not call CL api's with the cl_mem object passed to the pfn_notify.
			
 
				+ *
			
 
				+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
			
 
				+ * before using.
			
 
				+ */
			
 
				+#define cl_APPLE_SetMemObjectDestructor 1
			
 
				+cl_int	CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem /* memobj */, 
			
 
				+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), 
			
 
				+                                        void * /*user_data */ )             CL_EXT_SUFFIX__VERSION_1_0;  
			
 
				+
			
 
				+
			
 
				+/* Context Logging Functions
			
 
				+ *
			
 
				+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
			
 
				+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
			
 
				+ * before using.
			
 
				+ *
			
 
				+ * clLogMessagesToSystemLog fowards on all log messages to the Apple System Logger 
			
 
				+ */
			
 
				+#define cl_APPLE_ContextLoggingFunctions 1
			
 
				+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * /* errstr */, 
			
 
				+                                            const void * /* private_info */, 
			
 
				+                                            size_t       /* cb */, 
			
 
				+                                            void *       /* user_data */ )  CL_EXT_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
			
 
				+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * /* errstr */, 
			
 
				+                                          const void * /* private_info */, 
			
 
				+                                          size_t       /* cb */, 
			
 
				+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
			
 
				+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * /* errstr */, 
			
 
				+                                          const void * /* private_info */, 
			
 
				+                                          size_t       /* cb */, 
			
 
				+                                          void *       /* user_data */ )    CL_EXT_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+
			
 
				+/************************ 
			
 
				+* cl_khr_icd extension *                                                  
			
 
				+************************/
			
 
				+#define cl_khr_icd 1
			
 
				+
			
 
				+/* cl_platform_info                                                        */
			
 
				+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
			
 
				+
			
 
				+/* Additional Error Codes                                                  */
			
 
				+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
			
 
				+
			
 
				+extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+clIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
			
 
				+                       cl_platform_id * /* platforms */,
			
 
				+                       cl_uint *        /* num_platforms */);
			
 
				+
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
			
 
				+    cl_uint          /* num_entries */,
			
 
				+    cl_platform_id * /* platforms */,
			
 
				+    cl_uint *        /* num_platforms */);
			
 
				+
			
 
				+
			
 
				+/******************************************
			
 
				+* cl_nv_device_attribute_query extension *
			
 
				+******************************************/
			
 
				+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
			
 
				+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
			
 
				+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
			
 
				+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
			
 
				+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
			
 
				+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
			
 
				+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
			
 
				+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
			
 
				+
			
 
				+
			
 
				+/*********************************
			
 
				+* cl_amd_device_attribute_query *
			
 
				+*********************************/
			
 
				+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
			
 
				+
			
 
				+
			
 
				+#ifdef CL_VERSION_1_1
			
 
				+   /***********************************
			
 
				+    * cl_ext_device_fission extension *
			
 
				+    ***********************************/
			
 
				+    #define cl_ext_device_fission   1
			
 
				+    
			
 
				+    extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
			
 
				+    
			
 
				+    typedef CL_API_ENTRY cl_int 
			
 
				+    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+    extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1; 
			
 
				+    
			
 
				+    typedef CL_API_ENTRY cl_int 
			
 
				+    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+    typedef cl_ulong  cl_device_partition_property_ext;
			
 
				+    extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+    clCreateSubDevicesEXT(  cl_device_id /*in_device*/,
			
 
				+                            const cl_device_partition_property_ext * /* properties */,
			
 
				+                            cl_uint /*num_entries*/,
			
 
				+                            cl_device_id * /*out_devices*/,
			
 
				+                            cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+    typedef CL_API_ENTRY cl_int 
			
 
				+    ( CL_API_CALL * clCreateSubDevicesEXT_fn)(  cl_device_id /*in_device*/,
			
 
				+                                                const cl_device_partition_property_ext * /* properties */,
			
 
				+                                                cl_uint /*num_entries*/,
			
 
				+                                                cl_device_id * /*out_devices*/,
			
 
				+                                                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+    /* cl_device_partition_property_ext */
			
 
				+    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
			
 
				+    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
			
 
				+    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
			
 
				+    #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
			
 
				+    
			
 
				+    /* clDeviceGetInfo selectors */
			
 
				+    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
			
 
				+    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
			
 
				+    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
			
 
				+    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
			
 
				+    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
			
 
				+    
			
 
				+    /* error codes */
			
 
				+    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
			
 
				+    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
			
 
				+    #define CL_INVALID_PARTITION_NAME_EXT               -1059
			
 
				+    
			
 
				+    /* CL_AFFINITY_DOMAINs */
			
 
				+    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
			
 
				+    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
			
 
				+    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
			
 
				+    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
			
 
				+    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
			
 
				+    #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
			
 
				+    
			
 
				+    /* cl_device_partition_property_ext list terminators */
			
 
				+    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
			
 
				+    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
			
 
				+    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
			
 
				+
			
 
				+
			
 
				+
			
 
				+#endif /* CL_VERSION_1_1 */
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+#endif /* __CL_EXT_H */
			
--- a/socl/src/CL/cl_gl.h
+++ b/socl/src/CL/cl_gl.h
@@ -0,0 +1,163 @@
 
				+/**********************************************************************************
			
 
				+ * Copyright (c) 2011 The Khronos Group Inc.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a
			
 
				+ * copy of this software and/or associated documentation files (the
			
 
				+ * "Materials"), to deal in the Materials without restriction, including
			
 
				+ * without limitation the rights to use, copy, modify, merge, publish,
			
 
				+ * distribute, sublicense, and/or sell copies of the Materials, and to
			
 
				+ * permit persons to whom the Materials are furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included
			
 
				+ * in all copies or substantial portions of the Materials.
			
 
				+ *
			
 
				+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
			
 
				+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
			
 
				+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
			
 
				+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
			
 
				+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
			
 
				+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
			
 
				+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
			
 
				+ **********************************************************************************/
			
 
				+
			
 
				+#ifndef __OPENCL_CL_GL_H
			
 
				+#define __OPENCL_CL_GL_H
			
 
				+
			
 
				+#ifdef __APPLE__
			
 
				+#include <OpenCL/cl.h>
			
 
				+#else
			
 
				+#include <CL/cl.h>
			
 
				+#endif	
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+typedef cl_uint     cl_gl_object_type;
			
 
				+typedef cl_uint     cl_gl_texture_info;
			
 
				+typedef cl_uint     cl_gl_platform_info;
			
 
				+typedef struct __GLsync *cl_GLsync;
			
 
				+
			
 
				+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken           */
			
 
				+#define CL_GL_OBJECT_BUFFER                     0x2000
			
 
				+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
			
 
				+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
			
 
				+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
			
 
				+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
			
 
				+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
			
 
				+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
			
 
				+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
			
 
				+
			
 
				+/* cl_gl_texture_info           */
			
 
				+#define CL_GL_TEXTURE_TARGET                    0x2004
			
 
				+#define CL_GL_MIPMAP_LEVEL                      0x2005
			
 
				+
			
 
				+
			
 
				+extern CL_API_ENTRY cl_mem CL_API_CALL
			
 
				+clCreateFromGLBuffer(cl_context     /* context */,
			
 
				+                     cl_mem_flags   /* flags */,
			
 
				+                     cl_GLuint      /* bufobj */,
			
 
				+                     int *          /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+extern CL_API_ENTRY cl_mem CL_API_CALL
			
 
				+clCreateFromGLTexture(cl_context      /* context */,
			
 
				+                      cl_mem_flags    /* flags */,
			
 
				+                      cl_GLenum       /* target */,
			
 
				+                      cl_GLint        /* miplevel */,
			
 
				+                      cl_GLuint       /* texture */,
			
 
				+                      cl_int *        /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
			
 
				+    
			
 
				+extern CL_API_ENTRY cl_mem CL_API_CALL
			
 
				+clCreateFromGLRenderbuffer(cl_context   /* context */,
			
 
				+                           cl_mem_flags /* flags */,
			
 
				+                           cl_GLuint    /* renderbuffer */,
			
 
				+                           cl_int *     /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+clGetGLObjectInfo(cl_mem                /* memobj */,
			
 
				+                  cl_gl_object_type *   /* gl_object_type */,
			
 
				+                  cl_GLuint *           /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
			
 
				+                  
			
 
				+extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+clGetGLTextureInfo(cl_mem               /* memobj */,
			
 
				+                   cl_gl_texture_info   /* param_name */,
			
 
				+                   size_t               /* param_value_size */,
			
 
				+                   void *               /* param_value */,
			
 
				+                   size_t *             /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+clEnqueueAcquireGLObjects(cl_command_queue      /* command_queue */,
			
 
				+                          cl_uint               /* num_objects */,
			
 
				+                          const cl_mem *        /* mem_objects */,
			
 
				+                          cl_uint               /* num_events_in_wait_list */,
			
 
				+                          const cl_event *      /* event_wait_list */,
			
 
				+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+clEnqueueReleaseGLObjects(cl_command_queue      /* command_queue */,
			
 
				+                          cl_uint               /* num_objects */,
			
 
				+                          const cl_mem *        /* mem_objects */,
			
 
				+                          cl_uint               /* num_events_in_wait_list */,
			
 
				+                          const cl_event *      /* event_wait_list */,
			
 
				+                          cl_event *            /* event */) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+
			
 
				+#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
			
 
				+#warning CL_USE_DEPRECATED_OPENCL_1_1_APIS is defined. These APIs are unsupported and untested in OpenCL 1.2!
			
 
				+    extern CL_API_ENTRY cl_mem CL_API_CALL
			
 
				+    clCreateFromGLTexture2D(cl_context      /* context */,
			
 
				+                            cl_mem_flags    /* flags */,
			
 
				+                            cl_GLenum       /* target */,
			
 
				+                            cl_GLint        /* miplevel */,
			
 
				+                            cl_GLuint       /* texture */,
			
 
				+                            cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
			
 
				+    
			
 
				+    extern CL_API_ENTRY cl_mem CL_API_CALL
			
 
				+    clCreateFromGLTexture3D(cl_context      /* context */,
			
 
				+                            cl_mem_flags    /* flags */,
			
 
				+                            cl_GLenum       /* target */,
			
 
				+                            cl_GLint        /* miplevel */,
			
 
				+                            cl_GLuint       /* texture */,
			
 
				+                            cl_int *        /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
			
 
				+#endif /* CL_USE_DEPRECATED_OPENCL_1_2_APIS */
			
 
				+    
			
 
				+/* cl_khr_gl_sharing extension  */
			
 
				+    
			
 
				+#define cl_khr_gl_sharing 1
			
 
				+    
			
 
				+typedef cl_uint     cl_gl_context_info;
			
 
				+    
			
 
				+/* Additional Error Codes  */
			
 
				+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
			
 
				+    
			
 
				+/* cl_gl_context_info  */
			
 
				+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
			
 
				+#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
			
 
				+    
			
 
				+/* Additional cl_context_properties  */
			
 
				+#define CL_GL_CONTEXT_KHR                       0x2008
			
 
				+#define CL_EGL_DISPLAY_KHR                      0x2009
			
 
				+#define CL_GLX_DISPLAY_KHR                      0x200A
			
 
				+#define CL_WGL_HDC_KHR                          0x200B
			
 
				+#define CL_CGL_SHAREGROUP_KHR                   0x200C
			
 
				+    
			
 
				+extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
			
 
				+                      cl_gl_context_info            /* param_name */,
			
 
				+                      size_t                        /* param_value_size */,
			
 
				+                      void *                        /* param_value */,
			
 
				+                      size_t *                      /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
			
 
				+    
			
 
				+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
			
 
				+    const cl_context_properties * properties,
			
 
				+    cl_gl_context_info            param_name,
			
 
				+    size_t                        param_value_size,
			
 
				+    void *                        param_value,
			
 
				+    size_t *                      param_value_size_ret);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif  /* __OPENCL_CL_GL_H */
			
--- a/socl/src/CL/cl_gl_ext.h
+++ b/socl/src/CL/cl_gl_ext.h
@@ -0,0 +1,69 @@
 
				+/**********************************************************************************
			
 
				+ * Copyright (c) 2008-2010 The Khronos Group Inc.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a
			
 
				+ * copy of this software and/or associated documentation files (the
			
 
				+ * "Materials"), to deal in the Materials without restriction, including
			
 
				+ * without limitation the rights to use, copy, modify, merge, publish,
			
 
				+ * distribute, sublicense, and/or sell copies of the Materials, and to
			
 
				+ * permit persons to whom the Materials are furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included
			
 
				+ * in all copies or substantial portions of the Materials.
			
 
				+ *
			
 
				+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
			
 
				+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
			
 
				+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
			
 
				+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
			
 
				+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
			
 
				+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
			
 
				+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
			
 
				+ **********************************************************************************/
			
 
				+
			
 
				+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
			
 
				+
			
 
				+/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have           */
			
 
				+/* OpenGL dependencies.                                                         */
			
 
				+
			
 
				+#ifndef __OPENCL_CL_GL_EXT_H
			
 
				+#define __OPENCL_CL_GL_EXT_H
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+#ifdef __APPLE__
			
 
				+    #include <OpenCL/cl_gl.h>
			
 
				+#else
			
 
				+    #include <CL/cl_gl.h>
			
 
				+#endif
			
 
				+
			
 
				+/*
			
 
				+ * For each extension, follow this template
			
 
				+ *  cl_VEN_extname extension  */
			
 
				+/* #define cl_VEN_extname 1
			
 
				+ * ... define new types, if any
			
 
				+ * ... define new tokens, if any
			
 
				+ * ... define new APIs, if any
			
 
				+ *
			
 
				+ *  If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
			
 
				+ *  This allows us to avoid having to decide whether to include GL headers or GLES here.
			
 
				+ */
			
 
				+
			
 
				+/* 
			
 
				+ *  cl_khr_gl_event  extension
			
 
				+ *  See section 9.9 in the OpenCL 1.1 spec for more information
			
 
				+ */
			
 
				+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
			
 
				+
			
 
				+extern CL_API_ENTRY cl_event CL_API_CALL
			
 
				+clCreateEventFromGLsyncKHR(cl_context           /* context */,
			
 
				+                           cl_GLsync            /* cl_GLsync */,
			
 
				+                           cl_int *             /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif	/* __OPENCL_CL_GL_EXT_H  */
			
--- a/socl/src/CL/cl_platform.h
+++ b/socl/src/CL/cl_platform.h
--- a/socl/src/CL/opencl.h
+++ b/socl/src/CL/opencl.h
@@ -0,0 +1,54 @@
 
				+/*******************************************************************************
			
 
				+ * Copyright (c) 2008-2010 The Khronos Group Inc.
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a
			
 
				+ * copy of this software and/or associated documentation files (the
			
 
				+ * "Materials"), to deal in the Materials without restriction, including
			
 
				+ * without limitation the rights to use, copy, modify, merge, publish,
			
 
				+ * distribute, sublicense, and/or sell copies of the Materials, and to
			
 
				+ * permit persons to whom the Materials are furnished to do so, subject to
			
 
				+ * the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included
			
 
				+ * in all copies or substantial portions of the Materials.
			
 
				+ *
			
 
				+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
			
 
				+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
			
 
				+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
			
 
				+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
			
 
				+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
			
 
				+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
			
 
				+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
			
 
				+ ******************************************************************************/
			
 
				+
			
 
				+/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
			
 
				+
			
 
				+#ifndef __OPENCL_H
			
 
				+#define __OPENCL_H
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C" {
			
 
				+#endif
			
 
				+
			
 
				+#ifdef __APPLE__
			
 
				+
			
 
				+#include <OpenCL/cl.h>
			
 
				+#include <OpenCL/cl_gl.h>
			
 
				+#include <OpenCL/cl_gl_ext.h>
			
 
				+#include <OpenCL/cl_ext.h>
			
 
				+
			
 
				+#else
			
 
				+
			
 
				+#include <CL/cl.h>
			
 
				+#include <CL/cl_gl.h>
			
 
				+#include <CL/cl_gl_ext.h>
			
 
				+#include <CL/cl_ext.h>
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif  /* __OPENCL_H   */
			
 
				+
			
--- a/socl/src/Makefile.am
+++ b/socl/src/Makefile.am
@@ -33,9 +33,20 @@ noinst_HEADERS =				\
 
				   gc.h						\
			
 
				   getinfo.h					\
			
 
				   mem_objects.h					\
			
 
				+  ocl_icd.h					\
			
 
				   socl.h					\
			
 
				   task.h					\
			
 
				-  util.h
			
 
				+  util.h					\
			
 
				+  init.h					\
			
 
				+  CL/cl_d3d10.h					\
			
 
				+  CL/cl_ext.h					\
			
 
				+  CL/cl.h					\
			
 
				+  CL/cl_d3d11.h					\
			
 
				+  CL/cl_gl_ext.h				\
			
 
				+  CL/cl_platform.h				\
			
 
				+  CL/cl_dx9_media_sharing.h			\
			
 
				+  CL/cl_gl.h					\
			
 
				+  CL/opencl.h
			
 
				 
			
 
				 libsocl_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined			\
			
 
				   -version-info $(LIBSOCL_INTERFACE_CURRENT):$(LIBSOCL_INTERFACE_REVISION):$(LIBSOCL_INTERFACE_AGE)
			
@@ -118,7 +129,8 @@ libsocl_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 
				   cl_enqueuendrangekernel.c \
			
 
				   cl_enqueuenativekernel.c \
			
 
				   cl_geteventprofilinginfo.c \
			
 
				-  cl_getextensionfunctionaddress.c
			
 
				+  cl_getextensionfunctionaddress.c \
			
 
				+  cl_icdgetplatformidskhr.c
			
 
				 
			
 
				 
			
 
				 
			
--- a/socl/src/cl_createbuffer.c
+++ b/socl/src/cl_createbuffer.c
@@ -26,7 +26,7 @@ static void release_callback_memobject(void * e) {
 
				   mem_object_release(mem);
			
 
				 
			
 
				   /* Destruct object */
			
 
				-  starpu_data_unregister_lazy(mem->handle);
			
 
				+  starpu_data_unregister_submit(mem->handle);
			
 
				 
			
 
				   if (!(mem->flags & CL_MEM_USE_HOST_PTR))
			
 
				     free(mem->ptr);
			
--- a/socl/src/cl_createcontextfromtype.c
+++ b/socl/src/cl_createcontextfromtype.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,6 +17,7 @@
 
				  */
			
 
				 
			
 
				 #include "socl.h"
			
 
				+#include "init.h"
			
 
				 
			
 
				 CL_API_ENTRY cl_context CL_API_CALL
			
 
				 soclCreateContextFromType(const cl_context_properties * properties,
			
@@ -23,6 +26,8 @@ soclCreateContextFromType(const cl_context_properties * properties,
 
				                         void *                        user_data,
			
 
				                         cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				+   if( ! _starpu_init )
			
 
				+      socl_init_starpu(); 
			
 
				    //We assume clCreateContext doesn't support devices
			
 
				    //TODO:use devices
			
 
				    return soclCreateContext(properties, 0, NULL, pfn_notify, user_data, errcode_ret);
			
--- a/socl/src/cl_getdeviceids.c
+++ b/socl/src/cl_getdeviceids.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,7 +17,7 @@
 
				  */
			
 
				 
			
 
				 #include "socl.h"
			
 
				-
			
 
				+#include "init.h"
			
 
				 
			
 
				 /**
			
 
				  * \brief Return one device of each kind
			
@@ -24,11 +26,19 @@
 
				  */
			
 
				 CL_API_ENTRY cl_int CL_API_CALL
			
 
				 soclGetDeviceIDs(cl_platform_id   platform,
			
 
				-               cl_device_type   device_type, 
			
 
				-               cl_uint          num_entries, 
			
 
				-               cl_device_id *   devices, 
			
 
				+               cl_device_type   device_type,
			
 
				+               cl_uint          num_entries,
			
 
				+               cl_device_id *   devices,
			
 
				                cl_uint *        num_devices) CL_API_SUFFIX__VERSION_1_0
			
 
				 {
			
 
				+   if( ! _starpu_init )
			
 
				+      socl_init_starpu();
			
 
				+
			
 
				+   if (_starpu_init_failed) {
			
 
				+	*num_devices = 0;
			
 
				+	return CL_SUCCESS;
			
 
				+   }
			
 
				+
			
 
				    if (platform != NULL && platform != &socl_platform)
			
 
				       return CL_INVALID_PLATFORM;
			
 
				 
			
--- a/socl/src/cl_getextensionfunctionaddress.c
+++ b/socl/src/cl_getextensionfunctionaddress.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -14,6 +16,7 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include <string.h>
			
 
				 #include "socl.h"
			
 
				 
			
 
				 CL_API_ENTRY void * CL_API_CALL
			
@@ -22,3 +25,10 @@ soclGetExtensionFunctionAddress(const char * UNUSED(func_name)) CL_API_SUFFIX__V
 
				    //TODO
			
 
				    return NULL;
			
 
				 }
			
 
				+
			
 
				+CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddress(
			
 
				+             const char *   func_name) CL_API_SUFFIX__VERSION_1_0 {
			
 
				+  if( func_name != NULL &&  strcmp("clIcdGetPlatformIDsKHR", func_name) == 0 )
			
 
				+    return (void *)soclIcdGetPlatformIDsKHR;
			
 
				+  return NULL;
			
 
				+}
			
--- a/socl/src/cl_getplatforminfo.c
+++ b/socl/src/cl_getplatforminfo.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -39,6 +41,7 @@ soclGetPlatformInfo(cl_platform_id   platform,
 
				 	INFO_CASE_STRING(CL_PLATFORM_NAME,    SOCL_PLATFORM_NAME);
			
 
				 	INFO_CASE_STRING(CL_PLATFORM_VENDOR,  SOCL_VENDOR);
			
 
				 	INFO_CASE_STRING(CL_PLATFORM_EXTENSIONS, SOCL_PLATFORM_EXTENSIONS);
			
 
				+	INFO_CASE_STRING(CL_PLATFORM_ICD_SUFFIX_KHR, SOCL_PLATFORM_ICD_SUFFIX_KHR);
			
 
				    default:
			
 
				          return CL_INVALID_VALUE;
			
 
				    }
			
--- a/socl/src/cl_icdgetplatformidskhr.c
+++ b/socl/src/cl_icdgetplatformidskhr.c
@@ -0,0 +1,40 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include "socl.h"
			
 
				+
			
 
				+extern int _starpu_init_failed;
			
 
				+
			
 
				+CL_API_ENTRY cl_int CL_API_CALL soclIcdGetPlatformIDsKHR(
			
 
				+             cl_uint num_entries,
			
 
				+             cl_platform_id *platforms,
			
 
				+             cl_uint *num_platforms) CL_EXT_SUFFIX__VERSION_1_0{
			
 
				+     if ((num_entries == 0 && platforms != NULL)
			
 
				+	 || (num_platforms == NULL && platforms == NULL))
			
 
				+	  return CL_INVALID_VALUE;
			
 
				+
			
 
				+     else {
			
 
				+	  if (platforms != NULL)
			
 
				+	       platforms[0] = &socl_platform;
			
 
				+
			
 
				+	  if (num_platforms != NULL)
			
 
				+	       *num_platforms = 1;
			
 
				+     }
			
 
				+
			
 
				+   return CL_SUCCESS;
			
 
				+}
			
--- a/socl/src/command_queue.c
+++ b/socl/src/command_queue.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -144,7 +146,8 @@ void command_queue_enqueue_ex(cl_command_queue cq, cl_command cmd, cl_uint num_e
 
				 	int is_barrier = 0;
			
 
				 	if (cmd->typ == CL_COMMAND_BARRIER) {
			
 
				 		is_barrier = 1;
			
 
				-		/* OpenCL has no CL_COMMAND_BARRIER type, so we fall back on CL_COMMAND_MARKER */
			
 
				+		/* OpenCL has no CL_COMMAND_BARRIER type, so we fall back on CL_COMMAND_MARKER 
			
 
				+                   WARNING OpenCL has CL_COMMAND_BARRIER in 1.2*/
			
 
				 		cmd->typ = CL_COMMAND_MARKER;
			
 
				 	}
			
 
				 
			
--- a/socl/src/devices.c
+++ b/socl/src/devices.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -24,6 +26,7 @@ const cl_uint __attribute__ ((aligned (16))) SOCL_DEVICE_VENDOR_ID = 666;
 
				 
			
 
				 const struct _cl_device_id socl_devices[] = {
			
 
				    { 
			
 
				+      .dispatch = &socl_master_dispatch,
			
 
				       .type = CL_DEVICE_TYPE_CPU,
			
 
				       .max_compute_units = 1,
			
 
				       .max_work_item_dimensions = 3,
			
@@ -56,6 +59,7 @@ const struct _cl_device_id socl_devices[] = {
 
				       .extensions = ""
			
 
				    },
			
 
				    {
			
 
				+      .dispatch = &socl_master_dispatch,
			
 
				       .type = CL_DEVICE_TYPE_GPU,
			
 
				       .max_compute_units = 12,
			
 
				       .max_work_item_dimensions = 3,
			
@@ -88,6 +92,7 @@ const struct _cl_device_id socl_devices[] = {
 
				       .extensions = ""
			
 
				    },
			
 
				    {
			
 
				+      .dispatch = &socl_master_dispatch,
			
 
				       .type = CL_DEVICE_TYPE_GPU,
			
 
				       .max_compute_units = 12,
			
 
				       .max_work_item_dimensions = 3,
			
@@ -120,6 +125,7 @@ const struct _cl_device_id socl_devices[] = {
 
				       .extensions = ""
			
 
				    },
			
 
				    {
			
 
				+      .dispatch = &socl_master_dispatch,
			
 
				       .type = CL_DEVICE_TYPE_GPU,
			
 
				       .max_compute_units = 12,
			
 
				       .max_work_item_dimensions = 3,
			
@@ -152,6 +158,7 @@ const struct _cl_device_id socl_devices[] = {
 
				       .extensions = ""
			
 
				    },
			
 
				    {
			
 
				+      .dispatch = &socl_master_dispatch,
			
 
				       .type = CL_DEVICE_TYPE_GPU,
			
 
				       .max_compute_units = 12,
			
 
				       .max_work_item_dimensions = 3,
			
@@ -184,6 +191,7 @@ const struct _cl_device_id socl_devices[] = {
 
				       .extensions = ""
			
 
				    },
			
 
				    {
			
 
				+      .dispatch = &socl_master_dispatch,
			
 
				       .type = CL_DEVICE_TYPE_GPU,
			
 
				       .max_compute_units = 12,
			
 
				       .max_work_item_dimensions = 3,
			
@@ -216,6 +224,7 @@ const struct _cl_device_id socl_devices[] = {
 
				       .extensions = ""
			
 
				    },
			
 
				    {
			
 
				+      .dispatch = &socl_master_dispatch,
			
 
				       .type = CL_DEVICE_TYPE_GPU,
			
 
				       .max_compute_units = 12,
			
 
				       .max_work_item_dimensions = 3,
			
@@ -248,6 +257,7 @@ const struct _cl_device_id socl_devices[] = {
 
				       .extensions = ""
			
 
				    },
			
 
				    {
			
 
				+      .dispatch = &socl_master_dispatch,
			
 
				       .type = CL_DEVICE_TYPE_GPU,
			
 
				       .max_compute_units = 12,
			
 
				       .max_work_item_dimensions = 3,
			
--- a/socl/src/devices.h
+++ b/socl/src/devices.h
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,13 +17,14 @@
 
				 
			
 
				 #ifndef SOCL_DEVICES_H
			
 
				 #define SOCL_DEVICES_H
			
 
				-
			
 
				+#include "socl.h"
			
 
				 // OpenCL 1.0 : Mandatory format: major_number.minor_number
			
 
				 const char * SOCL_DRIVER_VERSION;
			
 
				 
			
 
				 const cl_uint SOCL_DEVICE_VENDOR_ID;
			
 
				 
			
 
				 struct _cl_device_id {
			
 
				+   struct _cl_icd_dispatch * dispatch;
			
 
				    cl_device_type    type;
			
 
				    cl_uint           max_compute_units;        //OpenCL 1.0: minimum value is 1
			
 
				    cl_uint           max_work_item_dimensions; //OpenCL 1.0: minimum value is 3
			
--- a/socl/src/gc.c
+++ b/socl/src/gc.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -141,6 +143,7 @@ int gc_entity_release_ex(entity e) {
 
				 void gc_entity_init(void *arg, void (*release_callback)(void*)) {
			
 
				   struct entity * e = (entity)arg;
			
 
				 
			
 
				+  e->dispatch = &socl_master_dispatch;
			
 
				   e->refs = 1;
			
 
				   e->release_callback = release_callback;
			
 
				   e->prev = NULL;
			
--- a/socl/src/init.c
+++ b/socl/src/init.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -14,29 +16,29 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#include <pthread.h>
			
 
				 #include "socl.h"
			
 
				 #include "gc.h"
			
 
				 #include "mem_objects.h"
			
 
				 
			
 
				 int _starpu_init_failed;
			
 
				+int _starpu_init = 0;
			
 
				+pthread_mutex_t _socl_mutex = PTHREAD_MUTEX_INITIALIZER;
			
 
				 
			
 
				-/**
			
 
				- * Initialize SOCL
			
 
				- */
			
 
				-__attribute__((constructor)) static void socl_init() {
			
 
				+void socl_init_starpu(void) {
			
 
				+  pthread_mutex_lock(&_socl_mutex);
			
 
				+  if( ! _starpu_init ){
			
 
				+    struct starpu_conf conf;
			
 
				+    starpu_conf_init(&conf);
			
 
				+    conf.ncuda = 0;
			
 
				 
			
 
				-  struct starpu_conf conf;
			
 
				-  starpu_conf_init(&conf);
			
 
				-  conf.ncuda = 0;
			
 
				 
			
 
				-  mem_object_init();
			
 
				-
			
 
				-  _starpu_init_failed = starpu_init(&conf);
			
 
				-  if (_starpu_init_failed != 0)
			
 
				-  {
			
 
				+    _starpu_init_failed = starpu_init(&conf);
			
 
				+    if (_starpu_init_failed != 0)
			
 
				+    {
			
 
				        DEBUG_MSG("Error when calling starpu_init: %d\n", _starpu_init_failed);
			
 
				-  }
			
 
				-  else {
			
 
				+    }
			
 
				+    else {
			
 
				        if (starpu_cpu_worker_get_count() == 0)
			
 
				        {
			
 
				 	    DEBUG_MSG("StarPU did not find any CPU device. SOCL needs at least 1 CPU.\n");
			
@@ -47,10 +49,22 @@ __attribute__((constructor)) static void socl_init() {
 
				 	    DEBUG_MSG("StarPU didn't find any OpenCL device. Try disabling CUDA support in StarPU (export STARPU_NCUDA=0).\n");
			
 
				 	    _starpu_init_failed = -ENODEV;
			
 
				        }
			
 
				+    }
			
 
				+
			
 
				+    /* Disable dataflow implicit dependencies */
			
 
				+    starpu_data_set_default_sequential_consistency_flag(0);
			
 
				+    _starpu_init = 1;
			
 
				   }
			
 
				+  pthread_mutex_unlock(&_socl_mutex);
			
 
				 
			
 
				-  /* Disable dataflow implicit dependencies */
			
 
				-  starpu_data_set_default_sequential_consistency_flag(0);
			
 
				+}
			
 
				+/**
			
 
				+ * Initialize SOCL
			
 
				+ */
			
 
				+__attribute__((constructor)) static void socl_init() {
			
 
				+
			
 
				+
			
 
				+  mem_object_init();
			
 
				 
			
 
				   gc_start();
			
 
				 }
			
@@ -59,17 +73,21 @@ __attribute__((constructor)) static void socl_init() {
 
				  * Shutdown SOCL
			
 
				  */
			
 
				 __attribute__((destructor)) static void socl_shutdown() {
			
 
				-
			
 
				-  starpu_task_wait_for_all();
			
 
				+  pthread_mutex_lock(&_socl_mutex);
			
 
				+  if( _starpu_init )
			
 
				+    starpu_task_wait_for_all();
			
 
				 
			
 
				   gc_stop();
			
 
				 
			
 
				-  starpu_task_wait_for_all();
			
 
				+  if( _starpu_init )
			
 
				+    starpu_task_wait_for_all();
			
 
				 
			
 
				   int active_entities = gc_active_entity_count();
			
 
				 
			
 
				   if (active_entities != 0)
			
 
				     DEBUG_MSG("Unreleased entities: %d\n", active_entities);
			
 
				 
			
 
				-  starpu_shutdown();
			
 
				+  if( _starpu_init )
			
 
				+    starpu_shutdown();
			
 
				+  pthread_mutex_unlock(&_socl_mutex);
			
 
				 }
			
--- a/mpi/starpu_mpi_insert_task_cache.h
+++ b/mpi/starpu_mpi_insert_task_cache.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -15,13 +15,15 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				-#include <starpu.h>
			
 
				-#include <common/list.h>
			
 
				+#include <pthread.h>
			
 
				+#include "socl.h"
			
 
				+#include "gc.h"
			
 
				+#include "mem_objects.h"
			
 
				 
			
 
				-#define _STARPU_MPI_CLEAR_SENT_DATA     0
			
 
				-#define _STARPU_MPI_CLEAR_RECEIVED_DATA 1
			
 
				-
			
 
				-extern struct starpu_htbl64_node **sent_data;
			
 
				-extern struct starpu_htbl64_node **received_data;
			
 
				+extern int _starpu_init_failed;
			
 
				+extern volatile int _starpu_init;
			
 
				+/**
			
 
				+ * Initialize StarPU
			
 
				+ */
			
 
				 
			
 
				-void _starpu_mpi_clear_cache_request(starpu_data_handle_t data_handle, int rank, int mode);
			
 
				+void socl_init_starpu(void);
			
--- a/socl/src/ocl_icd.h
+++ b/socl/src/ocl_icd.h
@@ -0,0 +1,905 @@
 
				+/**
			
 
				+Copyright (c) 2012, Brice Videau <brice.videau@imag.fr>
			
 
				+Copyright (c) 2012, Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				+All rights reserved.
			
 
				+      
			
 
				+Redistribution and use in source and binary forms, with or without
			
 
				+modification, are permitted provided that the following conditions are met:
			
 
				+    
			
 
				+1. Redistributions of source code must retain the above copyright notice, this
			
 
				+   list of conditions and the following disclaimer.
			
 
				+2. Redistributions in binary form must reproduce the above copyright notice,
			
 
				+   this list of conditions and the following disclaimer in the documentation
			
 
				+   and/or other materials provided with the distribution.
			
 
				+        
			
 
				+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
			
 
				+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
			
 
				+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
			
 
				+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
			
 
				+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
			
 
				+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
			
 
				+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
			
 
				+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
			
 
				+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
			
 
				+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
			
 
				+
			
 
				+Do not edit this file. It is automatically generated.
			
 
				+
			
 
				+*/
			
 
				+
			
 
				+#include "CL/cl.h"
			
 
				+#include "CL/cl_gl.h"
			
 
				+#include "CL/cl_ext.h"
			
 
				+
			
 
				+#define OCL_ICD_API_VERSION	1
			
 
				+#define OCL_ICD_IDENTIFIED_FUNCTIONS	102
			
 
				+
			
 
				+struct _cl_icd_dispatch {
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetPlatformIDs)(
			
 
				+    cl_uint          /* num_entries */,
			
 
				+    cl_platform_id * /* platforms */,
			
 
				+    cl_uint *        /* num_platforms */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*
			
 
				+  clGetPlatformInfo)(
			
 
				+    cl_platform_id   /* platform */,
			
 
				+    cl_platform_info /* param_name */,
			
 
				+    size_t           /* param_value_size */,
			
 
				+    void *           /* param_value */,
			
 
				+    size_t *         /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetDeviceIDs)(
			
 
				+    cl_platform_id   /* platform */,
			
 
				+    cl_device_type   /* device_type */,
			
 
				+    cl_uint          /* num_entries */,
			
 
				+    cl_device_id *   /* devices */,
			
 
				+    cl_uint *        /* num_devices */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetDeviceInfo)(
			
 
				+    cl_device_id    /* device */,
			
 
				+    cl_device_info  /* param_name */,
			
 
				+    size_t          /* param_value_size */,
			
 
				+    void *          /* param_value */,
			
 
				+    size_t *        /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_context (CL_API_CALL*clCreateContext)(
			
 
				+    const cl_context_properties * /* properties */,
			
 
				+    cl_uint                       /* num_devices */,
			
 
				+    const cl_device_id *          /* devices */,
			
 
				+    void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
			
 
				+    void *                        /* user_data */,
			
 
				+    cl_int *                      /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_context (CL_API_CALL*clCreateContextFromType)(
			
 
				+    const cl_context_properties * /* properties */,
			
 
				+    cl_device_type                /* device_type */,
			
 
				+    void (CL_CALLBACK *     /* pfn_notify*/ )(const char *, const void *, size_t, void *),
			
 
				+    void *                        /* user_data */,
			
 
				+    cl_int *                      /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clRetainContext)(
			
 
				+    cl_context /* context */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clReleaseContext)(
			
 
				+    cl_context /* context */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetContextInfo)(
			
 
				+    cl_context         /* context */,
			
 
				+    cl_context_info    /* param_name */,
			
 
				+    size_t             /* param_value_size */,
			
 
				+    void *             /* param_value */,
			
 
				+    size_t *           /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_command_queue (CL_API_CALL*clCreateCommandQueue)(
			
 
				+    cl_context                     /* context */,
			
 
				+    cl_device_id                   /* device */,
			
 
				+    cl_command_queue_properties    /* properties */,
			
 
				+    cl_int *                       /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clRetainCommandQueue)(
			
 
				+    cl_command_queue /* command_queue */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clReleaseCommandQueue)(
			
 
				+    cl_command_queue /* command_queue */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetCommandQueueInfo)(
			
 
				+    cl_command_queue      /* command_queue */,
			
 
				+    cl_command_queue_info /* param_name */,
			
 
				+    size_t                /* param_value_size */,
			
 
				+    void *                /* param_value */,
			
 
				+    size_t *              /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clSetCommandQueueProperty)(
			
 
				+    cl_command_queue              /* command_queue */,
			
 
				+    cl_command_queue_properties   /* properties */,
			
 
				+    cl_bool                        /* enable */,
			
 
				+    cl_command_queue_properties * /* old_properties */) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateBuffer)(
			
 
				+    cl_context   /* context */,
			
 
				+    cl_mem_flags /* flags */,
			
 
				+    size_t       /* size */,
			
 
				+    void *       /* host_ptr */,
			
 
				+    cl_int *     /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateImage2D)(
			
 
				+    cl_context              /* context */,
			
 
				+    cl_mem_flags            /* flags */,
			
 
				+    const cl_image_format * /* image_format */,
			
 
				+    size_t                  /* image_width */,
			
 
				+    size_t                  /* image_height */,
			
 
				+    size_t                  /* image_row_pitch */,
			
 
				+    void *                  /* host_ptr */,
			
 
				+    cl_int *                /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateImage3D)(
			
 
				+    cl_context              /* context */,
			
 
				+    cl_mem_flags            /* flags */,
			
 
				+    const cl_image_format * /* image_format */,
			
 
				+    size_t                  /* image_width */,
			
 
				+    size_t                  /* image_height */,
			
 
				+    size_t                  /* image_depth */,
			
 
				+    size_t                  /* image_row_pitch */,
			
 
				+    size_t                  /* image_slice_pitch */,
			
 
				+    void *                  /* host_ptr */,
			
 
				+    cl_int *                /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clRetainMemObject)(
			
 
				+    cl_mem /* memobj */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clReleaseMemObject)(
			
 
				+    cl_mem /* memobj */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetSupportedImageFormats)(
			
 
				+    cl_context           /* context */,
			
 
				+    cl_mem_flags         /* flags */,
			
 
				+    cl_mem_object_type   /* image_type */,
			
 
				+    cl_uint              /* num_entries */,
			
 
				+    cl_image_format *    /* image_formats */,
			
 
				+    cl_uint *            /* num_image_formats */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetMemObjectInfo)(
			
 
				+    cl_mem           /* memobj */,
			
 
				+    cl_mem_info      /* param_name */,
			
 
				+    size_t           /* param_value_size */,
			
 
				+    void *           /* param_value */,
			
 
				+    size_t *         /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetImageInfo)(
			
 
				+    cl_mem           /* image */,
			
 
				+    cl_image_info    /* param_name */,
			
 
				+    size_t           /* param_value_size */,
			
 
				+    void *           /* param_value */,
			
 
				+    size_t *         /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_sampler (CL_API_CALL*clCreateSampler)(
			
 
				+    cl_context          /* context */,
			
 
				+    cl_bool             /* normalized_coords */,
			
 
				+    cl_addressing_mode  /* addressing_mode */,
			
 
				+    cl_filter_mode      /* filter_mode */,
			
 
				+    cl_int *            /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clRetainSampler)(
			
 
				+    cl_sampler /* sampler */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clReleaseSampler)(
			
 
				+    cl_sampler /* sampler */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetSamplerInfo)(
			
 
				+    cl_sampler         /* sampler */,
			
 
				+    cl_sampler_info    /* param_name */,
			
 
				+    size_t             /* param_value_size */,
			
 
				+    void *             /* param_value */,
			
 
				+    size_t *           /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_program (CL_API_CALL*clCreateProgramWithSource)(
			
 
				+    cl_context        /* context */,
			
 
				+    cl_uint           /* count */,
			
 
				+    const char **     /* strings */,
			
 
				+    const size_t *    /* lengths */,
			
 
				+    cl_int *          /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_program (CL_API_CALL*clCreateProgramWithBinary)(
			
 
				+    cl_context                     /* context */,
			
 
				+    cl_uint                        /* num_devices */,
			
 
				+    const cl_device_id *           /* device_list */,
			
 
				+    const size_t *                 /* lengths */,
			
 
				+    const unsigned char **         /* binaries */,
			
 
				+    cl_int *                       /* binary_status */,
			
 
				+    cl_int *                       /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clRetainProgram)(
			
 
				+    cl_program /* program */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clReleaseProgram)(
			
 
				+    cl_program /* program */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clBuildProgram)(
			
 
				+    cl_program           /* program */,
			
 
				+    cl_uint              /* num_devices */,
			
 
				+    const cl_device_id * /* device_list */,
			
 
				+    const char *         /* options */,
			
 
				+    void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
			
 
				+    void *               /* user_data */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clUnloadCompiler)(
			
 
				+    void
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetProgramInfo)(
			
 
				+    cl_program         /* program */,
			
 
				+    cl_program_info    /* param_name */,
			
 
				+    size_t             /* param_value_size */,
			
 
				+    void *             /* param_value */,
			
 
				+    size_t *           /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetProgramBuildInfo)(
			
 
				+    cl_program            /* program */,
			
 
				+    cl_device_id          /* device */,
			
 
				+    cl_program_build_info /* param_name */,
			
 
				+    size_t                /* param_value_size */,
			
 
				+    void *                /* param_value */,
			
 
				+    size_t *              /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_kernel (CL_API_CALL*clCreateKernel)(
			
 
				+    cl_program      /* program */,
			
 
				+    const char *    /* kernel_name */,
			
 
				+    cl_int *        /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clCreateKernelsInProgram)(
			
 
				+    cl_program     /* program */,
			
 
				+    cl_uint        /* num_kernels */,
			
 
				+    cl_kernel *    /* kernels */,
			
 
				+    cl_uint *      /* num_kernels_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clRetainKernel)(
			
 
				+    cl_kernel    /* kernel */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clReleaseKernel)(
			
 
				+    cl_kernel   /* kernel */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clSetKernelArg)(
			
 
				+    cl_kernel    /* kernel */,
			
 
				+    cl_uint      /* arg_index */,
			
 
				+    size_t       /* arg_size */,
			
 
				+    const void * /* arg_value */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetKernelInfo)(
			
 
				+    cl_kernel       /* kernel */,
			
 
				+    cl_kernel_info  /* param_name */,
			
 
				+    size_t          /* param_value_size */,
			
 
				+    void *          /* param_value */,
			
 
				+    size_t *        /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetKernelWorkGroupInfo)(
			
 
				+    cl_kernel                  /* kernel */,
			
 
				+    cl_device_id               /* device */,
			
 
				+    cl_kernel_work_group_info  /* param_name */,
			
 
				+    size_t                     /* param_value_size */,
			
 
				+    void *                     /* param_value */,
			
 
				+    size_t *                   /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clWaitForEvents)(
			
 
				+    cl_uint             /* num_events */,
			
 
				+    const cl_event *    /* event_list */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetEventInfo)(
			
 
				+    cl_event         /* event */,
			
 
				+    cl_event_info    /* param_name */,
			
 
				+    size_t           /* param_value_size */,
			
 
				+    void *           /* param_value */,
			
 
				+    size_t *         /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clRetainEvent)(
			
 
				+    cl_event /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clReleaseEvent)(
			
 
				+    cl_event /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetEventProfilingInfo)(
			
 
				+    cl_event            /* event */,
			
 
				+    cl_profiling_info   /* param_name */,
			
 
				+    size_t              /* param_value_size */,
			
 
				+    void *              /* param_value */,
			
 
				+    size_t *            /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clFlush)(
			
 
				+    cl_command_queue /* command_queue */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clFinish)(
			
 
				+    cl_command_queue /* command_queue */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueReadBuffer)(
			
 
				+    cl_command_queue    /* command_queue */,
			
 
				+    cl_mem              /* buffer */,
			
 
				+    cl_bool             /* blocking_read */,
			
 
				+    size_t              /* offset */,
			
 
				+    size_t              /* cb */,
			
 
				+    void *              /* ptr */,
			
 
				+    cl_uint             /* num_events_in_wait_list */,
			
 
				+    const cl_event *    /* event_wait_list */,
			
 
				+    cl_event *          /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueWriteBuffer)(
			
 
				+    cl_command_queue   /* command_queue */,
			
 
				+    cl_mem             /* buffer */,
			
 
				+    cl_bool            /* blocking_write */,
			
 
				+    size_t             /* offset */,
			
 
				+    size_t             /* cb */,
			
 
				+    const void *       /* ptr */,
			
 
				+    cl_uint            /* num_events_in_wait_list */,
			
 
				+    const cl_event *   /* event_wait_list */,
			
 
				+    cl_event *         /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueCopyBuffer)(
			
 
				+    cl_command_queue    /* command_queue */,
			
 
				+    cl_mem              /* src_buffer */,
			
 
				+    cl_mem              /* dst_buffer */,
			
 
				+    size_t              /* src_offset */,
			
 
				+    size_t              /* dst_offset */,
			
 
				+    size_t              /* cb */,
			
 
				+    cl_uint             /* num_events_in_wait_list */,
			
 
				+    const cl_event *    /* event_wait_list */,
			
 
				+    cl_event *          /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueReadImage)(
			
 
				+    cl_command_queue     /* command_queue */,
			
 
				+    cl_mem               /* image */,
			
 
				+    cl_bool              /* blocking_read */,
			
 
				+    const size_t *       /* origin[3] */,
			
 
				+    const size_t *       /* region[3] */,
			
 
				+    size_t               /* row_pitch */,
			
 
				+    size_t               /* slice_pitch */,
			
 
				+    void *               /* ptr */,
			
 
				+    cl_uint              /* num_events_in_wait_list */,
			
 
				+    const cl_event *     /* event_wait_list */,
			
 
				+    cl_event *           /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueWriteImage)(
			
 
				+    cl_command_queue    /* command_queue */,
			
 
				+    cl_mem              /* image */,
			
 
				+    cl_bool             /* blocking_write */,
			
 
				+    const size_t *      /* origin[3] */,
			
 
				+    const size_t *      /* region[3] */,
			
 
				+    size_t              /* input_row_pitch */,
			
 
				+    size_t              /* input_slice_pitch */,
			
 
				+    const void *        /* ptr */,
			
 
				+    cl_uint             /* num_events_in_wait_list */,
			
 
				+    const cl_event *    /* event_wait_list */,
			
 
				+    cl_event *          /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueCopyImage)(
			
 
				+    cl_command_queue     /* command_queue */,
			
 
				+    cl_mem               /* src_image */,
			
 
				+    cl_mem               /* dst_image */,
			
 
				+    const size_t *       /* src_origin[3] */,
			
 
				+    const size_t *       /* dst_origin[3] */,
			
 
				+    const size_t *       /* region[3] */,
			
 
				+    cl_uint              /* num_events_in_wait_list */,
			
 
				+    const cl_event *     /* event_wait_list */,
			
 
				+    cl_event *           /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueCopyImageToBuffer)(
			
 
				+    cl_command_queue /* command_queue */,
			
 
				+    cl_mem           /* src_image */,
			
 
				+    cl_mem           /* dst_buffer */,
			
 
				+    const size_t *   /* src_origin[3] */,
			
 
				+    const size_t *   /* region[3] */,
			
 
				+    size_t           /* dst_offset */,
			
 
				+    cl_uint          /* num_events_in_wait_list */,
			
 
				+    const cl_event * /* event_wait_list */,
			
 
				+    cl_event *       /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueCopyBufferToImage)(
			
 
				+    cl_command_queue /* command_queue */,
			
 
				+    cl_mem           /* src_buffer */,
			
 
				+    cl_mem           /* dst_image */,
			
 
				+    size_t           /* src_offset */,
			
 
				+    const size_t *   /* dst_origin[3] */,
			
 
				+    const size_t *   /* region[3] */,
			
 
				+    cl_uint          /* num_events_in_wait_list */,
			
 
				+    const cl_event * /* event_wait_list */,
			
 
				+    cl_event *       /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY void * (CL_API_CALL*clEnqueueMapBuffer)(
			
 
				+    cl_command_queue /* command_queue */,
			
 
				+    cl_mem           /* buffer */,
			
 
				+    cl_bool          /* blocking_map */,
			
 
				+    cl_map_flags     /* map_flags */,
			
 
				+    size_t           /* offset */,
			
 
				+    size_t           /* cb */,
			
 
				+    cl_uint          /* num_events_in_wait_list */,
			
 
				+    const cl_event * /* event_wait_list */,
			
 
				+    cl_event *       /* event */,
			
 
				+    cl_int *         /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY void * (CL_API_CALL*clEnqueueMapImage)(
			
 
				+    cl_command_queue  /* command_queue */,
			
 
				+    cl_mem            /* image */,
			
 
				+    cl_bool           /* blocking_map */,
			
 
				+    cl_map_flags      /* map_flags */,
			
 
				+    const size_t *    /* origin[3] */,
			
 
				+    const size_t *    /* region[3] */,
			
 
				+    size_t *          /* image_row_pitch */,
			
 
				+    size_t *          /* image_slice_pitch */,
			
 
				+    cl_uint           /* num_events_in_wait_list */,
			
 
				+    const cl_event *  /* event_wait_list */,
			
 
				+    cl_event *        /* event */,
			
 
				+    cl_int *          /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueUnmapMemObject)(
			
 
				+    cl_command_queue /* command_queue */,
			
 
				+    cl_mem           /* memobj */,
			
 
				+    void *           /* mapped_ptr */,
			
 
				+    cl_uint          /* num_events_in_wait_list */,
			
 
				+    const cl_event *  /* event_wait_list */,
			
 
				+    cl_event *        /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueNDRangeKernel)(
			
 
				+    cl_command_queue /* command_queue */,
			
 
				+    cl_kernel        /* kernel */,
			
 
				+    cl_uint          /* work_dim */,
			
 
				+    const size_t *   /* global_work_offset */,
			
 
				+    const size_t *   /* global_work_size */,
			
 
				+    const size_t *   /* local_work_size */,
			
 
				+    cl_uint          /* num_events_in_wait_list */,
			
 
				+    const cl_event * /* event_wait_list */,
			
 
				+    cl_event *       /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueTask)(
			
 
				+    cl_command_queue  /* command_queue */,
			
 
				+    cl_kernel         /* kernel */,
			
 
				+    cl_uint           /* num_events_in_wait_list */,
			
 
				+    const cl_event *  /* event_wait_list */,
			
 
				+    cl_event *        /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueNativeKernel)(
			
 
				+    cl_command_queue  /* command_queue */,
			
 
				+    void (*user_func)(void *),
			
 
				+    void *            /* args */,
			
 
				+    size_t            /* cb_args */,
			
 
				+    cl_uint           /* num_mem_objects */,
			
 
				+    const cl_mem *    /* mem_list */,
			
 
				+    const void **     /* args_mem_loc */,
			
 
				+    cl_uint           /* num_events_in_wait_list */,
			
 
				+    const cl_event *  /* event_wait_list */,
			
 
				+    cl_event *        /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueMarker)(
			
 
				+    cl_command_queue    /* command_queue */,
			
 
				+    cl_event *          /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueWaitForEvents)(
			
 
				+    cl_command_queue /* command_queue */,
			
 
				+    cl_uint          /* num_events */,
			
 
				+    const cl_event * /* event_list */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueBarrier)(
			
 
				+    cl_command_queue /* command_queue */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY void * (CL_API_CALL*clGetExtensionFunctionAddress)(
			
 
				+    const char * /* func_name */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateFromGLBuffer)(
			
 
				+    cl_context     /* context */,
			
 
				+    cl_mem_flags   /* flags */,
			
 
				+    cl_GLuint      /* bufobj */,
			
 
				+    int *          /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateFromGLTexture2D)(
			
 
				+    cl_context      /* context */,
			
 
				+    cl_mem_flags    /* flags */,
			
 
				+    cl_GLenum       /* target */,
			
 
				+    cl_GLint        /* miplevel */,
			
 
				+    cl_GLuint       /* texture */,
			
 
				+    cl_int *        /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateFromGLTexture3D)(
			
 
				+    cl_context      /* context */,
			
 
				+    cl_mem_flags    /* flags */,
			
 
				+    cl_GLenum       /* target */,
			
 
				+    cl_GLint        /* miplevel */,
			
 
				+    cl_GLuint       /* texture */,
			
 
				+    cl_int *        /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateFromGLRenderbuffer)(
			
 
				+    cl_context   /* context */,
			
 
				+    cl_mem_flags /* flags */,
			
 
				+    cl_GLuint    /* renderbuffer */,
			
 
				+    cl_int *     /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetGLObjectInfo)(
			
 
				+    cl_mem                /* memobj */,
			
 
				+    cl_gl_object_type *   /* gl_object_type */,
			
 
				+    cl_GLuint *              /* gl_object_name */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetGLTextureInfo)(
			
 
				+    cl_mem               /* memobj */,
			
 
				+    cl_gl_texture_info   /* param_name */,
			
 
				+    size_t               /* param_value_size */,
			
 
				+    void *               /* param_value */,
			
 
				+    size_t *             /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueAcquireGLObjects)(
			
 
				+    cl_command_queue      /* command_queue */,
			
 
				+    cl_uint               /* num_objects */,
			
 
				+    const cl_mem *        /* mem_objects */,
			
 
				+    cl_uint               /* num_events_in_wait_list */,
			
 
				+    const cl_event *      /* event_wait_list */,
			
 
				+    cl_event *            /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueReleaseGLObjects)(
			
 
				+    cl_command_queue      /* command_queue */,
			
 
				+    cl_uint               /* num_objects */,
			
 
				+    const cl_mem *        /* mem_objects */,
			
 
				+    cl_uint               /* num_events_in_wait_list */,
			
 
				+    const cl_event *      /* event_wait_list */,
			
 
				+    cl_event *            /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetGLContextInfoKHR)(
			
 
				+    const cl_context_properties * /* properties */,
			
 
				+    cl_gl_context_info            /* param_name */,
			
 
				+    size_t                        /* param_value_size */,
			
 
				+    void *                        /* param_value */,
			
 
				+    size_t *                      /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown75)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown76)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown77)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown78)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown79)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown80)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clSetEventCallback)(
			
 
				+    cl_event    /* event */,
			
 
				+    cl_int      /* command_exec_callback_type */,
			
 
				+    void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
			
 
				+    void *      /* user_data */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateSubBuffer)(
			
 
				+    cl_mem                   /* buffer */,
			
 
				+    cl_mem_flags             /* flags */,
			
 
				+    cl_buffer_create_type    /* buffer_create_type */,
			
 
				+    const void *             /* buffer_create_info */,
			
 
				+    cl_int *                 /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clSetMemObjectDestructorCallback)(
			
 
				+    cl_mem /* memobj */,
			
 
				+    void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
			
 
				+    void * /*user_data */ )             CL_API_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_event (CL_API_CALL*clCreateUserEvent)(
			
 
				+    cl_context    /* context */,
			
 
				+    cl_int *      /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clSetUserEventStatus)(
			
 
				+    cl_event   /* event */,
			
 
				+    cl_int     /* execution_status */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueReadBufferRect)(
			
 
				+    cl_command_queue    /* command_queue */,
			
 
				+    cl_mem              /* buffer */,
			
 
				+    cl_bool             /* blocking_read */,
			
 
				+    const size_t *      /* buffer_origin */,
			
 
				+    const size_t *      /* host_origin */,
			
 
				+    const size_t *      /* region */,
			
 
				+    size_t              /* buffer_row_pitch */,
			
 
				+    size_t              /* buffer_slice_pitch */,
			
 
				+    size_t              /* host_row_pitch */,
			
 
				+    size_t              /* host_slice_pitch */,
			
 
				+    void *              /* ptr */,
			
 
				+    cl_uint             /* num_events_in_wait_list */,
			
 
				+    const cl_event *    /* event_wait_list */,
			
 
				+    cl_event *          /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueWriteBufferRect)(
			
 
				+    cl_command_queue    /* command_queue */,
			
 
				+    cl_mem              /* buffer */,
			
 
				+    cl_bool             /* blocking_write */,
			
 
				+    const size_t *      /* buffer_origin */,
			
 
				+    const size_t *      /* host_origin */,
			
 
				+    const size_t *      /* region */,
			
 
				+    size_t              /* buffer_row_pitch */,
			
 
				+    size_t              /* buffer_slice_pitch */,
			
 
				+    size_t              /* host_row_pitch */,
			
 
				+    size_t              /* host_slice_pitch */,
			
 
				+    const void *        /* ptr */,
			
 
				+    cl_uint             /* num_events_in_wait_list */,
			
 
				+    const cl_event *    /* event_wait_list */,
			
 
				+    cl_event *          /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueCopyBufferRect)(
			
 
				+    cl_command_queue    /* command_queue */,
			
 
				+    cl_mem              /* src_buffer */,
			
 
				+    cl_mem              /* dst_buffer */,
			
 
				+    const size_t *      /* src_origin */,
			
 
				+    const size_t *      /* dst_origin */,
			
 
				+    const size_t *      /* region */,
			
 
				+    size_t              /* src_row_pitch */,
			
 
				+    size_t              /* src_slice_pitch */,
			
 
				+    size_t              /* dst_row_pitch */,
			
 
				+    size_t              /* dst_slice_pitch */,
			
 
				+    cl_uint             /* num_events_in_wait_list */,
			
 
				+    const cl_event *    /* event_wait_list */,
			
 
				+    cl_event *          /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*    clCreateSubDevicesEXT)(
			
 
				+    cl_device_id /*in_device*/,
			
 
				+    const cl_device_partition_property_ext * /* properties */,
			
 
				+    cl_uint /*num_entries*/,
			
 
				+    cl_device_id * /*out_devices*/,
			
 
				+    cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*    clRetainDeviceEXT)(
			
 
				+    cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*    clReleaseDeviceEXT)(
			
 
				+    cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown92)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clCreateSubDevices)(
			
 
				+    cl_device_id                         /* in_device */,
			
 
				+    const cl_device_partition_property * /* properties */,
			
 
				+    cl_uint                              /* num_devices */,
			
 
				+    cl_device_id *                       /* out_devices */,
			
 
				+    cl_uint *                            /* num_devices_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clRetainDevice)(
			
 
				+    cl_device_id /* device */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clReleaseDevice)(
			
 
				+    cl_device_id /* device */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateImage)(
			
 
				+    cl_context              /* context */,
			
 
				+    cl_mem_flags            /* flags */,
			
 
				+    const cl_image_format * /* image_format */,
			
 
				+    const cl_image_desc *   /* image_desc */,
			
 
				+    void *                  /* host_ptr */,
			
 
				+    cl_int *                /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_program (CL_API_CALL*clCreateProgramWithBuiltInKernels)(
			
 
				+    cl_context            /* context */,
			
 
				+    cl_uint               /* num_devices */,
			
 
				+    const cl_device_id *  /* device_list */,
			
 
				+    const char *          /* kernel_names */,
			
 
				+    cl_int *              /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clCompileProgram)(
			
 
				+    cl_program           /* program */,
			
 
				+    cl_uint              /* num_devices */,
			
 
				+    const cl_device_id * /* device_list */,
			
 
				+    const char *         /* options */,
			
 
				+    cl_uint              /* num_input_headers */,
			
 
				+    const cl_program *   /* input_headers */,
			
 
				+    const char **        /* header_include_names */,
			
 
				+    void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
			
 
				+    void *               /* user_data */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_program (CL_API_CALL*clLinkProgram)(
			
 
				+    cl_context           /* context */,
			
 
				+    cl_uint              /* num_devices */,
			
 
				+    const cl_device_id * /* device_list */,
			
 
				+    const char *         /* options */,
			
 
				+    cl_uint              /* num_input_programs */,
			
 
				+    const cl_program *   /* input_programs */,
			
 
				+    void (CL_CALLBACK *  /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
			
 
				+    void *               /* user_data */,
			
 
				+    cl_int *             /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clUnloadPlatformCompiler)(
			
 
				+    cl_platform_id /* platform */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clGetKernelArgInfo)(
			
 
				+    cl_kernel       /* kernel */,
			
 
				+    cl_uint         /* arg_indx */,
			
 
				+    cl_kernel_arg_info  /* param_name */,
			
 
				+    size_t          /* param_value_size */,
			
 
				+    void *          /* param_value */,
			
 
				+    size_t *        /* param_value_size_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueFillBuffer)(
			
 
				+    cl_command_queue   /* command_queue */,
			
 
				+    cl_mem             /* buffer */,
			
 
				+    const void *       /* pattern */,
			
 
				+    size_t             /* pattern_size */,
			
 
				+    size_t             /* offset */,
			
 
				+    size_t             /* size */,
			
 
				+    cl_uint            /* num_events_in_wait_list */,
			
 
				+    const cl_event *   /* event_wait_list */,
			
 
				+    cl_event *         /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueFillImage)(
			
 
				+    cl_command_queue   /* command_queue */,
			
 
				+    cl_mem             /* image */,
			
 
				+    const void *       /* fill_color */,
			
 
				+    const size_t *     /* origin[3] */,
			
 
				+    const size_t *     /* region[3] */,
			
 
				+    cl_uint            /* num_events_in_wait_list */,
			
 
				+    const cl_event *   /* event_wait_list */,
			
 
				+    cl_event *         /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueMigrateMemObjects)(
			
 
				+    cl_command_queue       /* command_queue */,
			
 
				+    cl_uint                /* num_mem_objects */,
			
 
				+    const cl_mem *         /* mem_objects */,
			
 
				+    cl_mem_migration_flags /* flags */,
			
 
				+    cl_uint                /* num_events_in_wait_list */,
			
 
				+    const cl_event *       /* event_wait_list */,
			
 
				+    cl_event *             /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueMarkerWithWaitList)(
			
 
				+    cl_command_queue /* command_queue */,
			
 
				+    cl_uint           /* num_events_in_wait_list */,
			
 
				+    const cl_event *  /* event_wait_list */,
			
 
				+    cl_event *        /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL*clEnqueueBarrierWithWaitList)(
			
 
				+    cl_command_queue /* command_queue */,
			
 
				+    cl_uint           /* num_events_in_wait_list */,
			
 
				+    const cl_event *  /* event_wait_list */,
			
 
				+    cl_event *        /* event */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY void * (CL_API_CALL*
			
 
				+  clGetExtensionFunctionAddressForPlatform)(
			
 
				+    cl_platform_id /* platform */,
			
 
				+    const char *   /* func_name */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_mem (CL_API_CALL*clCreateFromGLTexture)(
			
 
				+    cl_context      /* context */,
			
 
				+    cl_mem_flags    /* flags */,
			
 
				+    cl_GLenum       /* target */,
			
 
				+    cl_GLint        /* miplevel */,
			
 
				+    cl_GLuint       /* texture */,
			
 
				+    cl_int *        /* errcode_ret */
			
 
				+  ) CL_API_SUFFIX__VERSION_1_2;
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown109)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown110)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown111)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown112)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown113)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown114)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown115)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown116)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown117)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown118)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown119)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown120)(
			
 
				+    void);
			
 
				+
			
 
				+  CL_API_ENTRY cl_int (CL_API_CALL* clUnknown121)(
			
 
				+    void);
			
 
				+
			
 
				+};
			
 
				+
			
--- a/socl/src/socl.c
+++ b/socl/src/socl.c
@@ -1,6 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				+ * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -16,13 +18,140 @@
 
				 
			
 
				 #include "socl.h"
			
 
				 
			
 
				-struct _cl_platform_id socl_platform = {};
			
 
				+struct _cl_icd_dispatch socl_master_dispatch = {
			
 
				+  soclGetPlatformIDs,
			
 
				+  soclGetPlatformInfo,
			
 
				+  soclGetDeviceIDs,
			
 
				+  soclGetDeviceInfo,
			
 
				+  soclCreateContext,
			
 
				+  soclCreateContextFromType,
			
 
				+  soclRetainContext,
			
 
				+  soclReleaseContext,
			
 
				+  soclGetContextInfo,
			
 
				+  soclCreateCommandQueue,
			
 
				+  soclRetainCommandQueue,
			
 
				+  soclReleaseCommandQueue,
			
 
				+  soclGetCommandQueueInfo,
			
 
				+  soclSetCommandQueueProperty,
			
 
				+  soclCreateBuffer,
			
 
				+  soclCreateImage2D,
			
 
				+  soclCreateImage3D,
			
 
				+  soclRetainMemObject,
			
 
				+  soclReleaseMemObject,
			
 
				+  soclGetSupportedImageFormats,
			
 
				+  soclGetMemObjectInfo,
			
 
				+  soclGetImageInfo,
			
 
				+  soclCreateSampler,
			
 
				+  soclRetainSampler,
			
 
				+  soclReleaseSampler,
			
 
				+  soclGetSamplerInfo,
			
 
				+  soclCreateProgramWithSource,
			
 
				+  soclCreateProgramWithBinary,
			
 
				+  soclRetainProgram,
			
 
				+  soclReleaseProgram,
			
 
				+  soclBuildProgram,
			
 
				+  soclUnloadCompiler,
			
 
				+  soclGetProgramInfo,
			
 
				+  soclGetProgramBuildInfo,
			
 
				+  soclCreateKernel,
			
 
				+  soclCreateKernelsInProgram,
			
 
				+  soclRetainKernel,
			
 
				+  soclReleaseKernel,
			
 
				+  soclSetKernelArg,
			
 
				+  soclGetKernelInfo,
			
 
				+  soclGetKernelWorkGroupInfo,
			
 
				+  soclWaitForEvents,
			
 
				+  soclGetEventInfo,
			
 
				+  soclRetainEvent,
			
 
				+  soclReleaseEvent,
			
 
				+  soclGetEventProfilingInfo,
			
 
				+  soclFlush,
			
 
				+  soclFinish,
			
 
				+  soclEnqueueReadBuffer,
			
 
				+  soclEnqueueWriteBuffer,
			
 
				+  soclEnqueueCopyBuffer,
			
 
				+  soclEnqueueReadImage,
			
 
				+  soclEnqueueWriteImage,
			
 
				+  soclEnqueueCopyImage,
			
 
				+  soclEnqueueCopyImageToBuffer,
			
 
				+  soclEnqueueCopyBufferToImage,
			
 
				+  soclEnqueueMapBuffer,
			
 
				+  soclEnqueueMapImage,
			
 
				+  soclEnqueueUnmapMemObject,
			
 
				+  soclEnqueueNDRangeKernel,
			
 
				+  soclEnqueueTask,
			
 
				+  soclEnqueueNativeKernel,
			
 
				+  soclEnqueueMarker,
			
 
				+  soclEnqueueWaitForEvents,
			
 
				+  soclEnqueueBarrier,
			
 
				+  soclGetExtensionFunctionAddress,
			
 
				+  (void *) NULL, //  clCreateFromGLBuffer,
			
 
				+  (void *) NULL, //  clCreateFromGLTexture2D,
			
 
				+  (void *) NULL, //  clCreateFromGLTexture3D,
			
 
				+  (void *) NULL, //  clCreateFromGLRenderbuffer,
			
 
				+  (void *) NULL, //  clGetGLObjectInfo,
			
 
				+  (void *) NULL, //  clGetGLTextureInfo,
			
 
				+  (void *) NULL, //  clEnqueueAcquireGLObjects,
			
 
				+  (void *) NULL, //  clEnqueueReleaseGLObjects,
			
 
				+  (void *) NULL, //  clGetGLContextInfoKHR,
			
 
				+  (void *) NULL, //
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL, //  clSetEventCallback,
			
 
				+  (void *) NULL, //  clCreateSubBuffer,
			
 
				+  (void *) NULL, //  clSetMemObjectDestructorCallback,
			
 
				+  (void *) NULL, //  clCreateUserEvent,
			
 
				+  (void *) NULL, //  clSetUserEventStatus,
			
 
				+  (void *) NULL, //  clEnqueueReadBufferRect,
			
 
				+  (void *) NULL, //  clEnqueueWriteBufferRect,
			
 
				+  (void *) NULL, //  clEnqueueCopyBufferRect,
			
 
				+  (void *) NULL, //  clCreateSubDevicesEXT,
			
 
				+  (void *) NULL, //  clRetainDeviceEXT,
			
 
				+  (void *) NULL, //  clReleaseDeviceEXT,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL, //  clCreateSubDevices,
			
 
				+  (void *) NULL, //  clRetainDevice,
			
 
				+  (void *) NULL, //  clReleaseDevice,
			
 
				+  (void *) NULL, //  clCreateImage,
			
 
				+  (void *) NULL, //  clCreateProgramWithBuiltInKernels,
			
 
				+  (void *) NULL, //  clCompileProgram,
			
 
				+  (void *) NULL, //  clLinkProgram,
			
 
				+  (void *) NULL, //  clUnloadPlatformCompiler,
			
 
				+  (void *) NULL, //  clGetKernelArgInfo,
			
 
				+  (void *) NULL, //  clEnqueueFillBuffer,
			
 
				+  (void *) NULL, //  clEnqueueFillImage,
			
 
				+  (void *) NULL, //  clEnqueueMigrateMemObjects,
			
 
				+  (void *) NULL, //  clEnqueueMarkerWithWaitList,
			
 
				+  (void *) NULL, //  clEnqueueBarrierWithWaitList,
			
 
				+  (void *) NULL, //  clGetExtensionFunctionAddressForPlatform,
			
 
				+  (void *) NULL, //  clCreateFromGLTexture,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL,
			
 
				+  (void *) NULL
			
 
				+};
			
 
				 
			
 
				-const char * __attribute__ ((aligned (16))) SOCL_PROFILE = "FULL_PROFILE" ;
			
 
				+
			
 
				+struct _cl_platform_id socl_platform = {&socl_master_dispatch};
			
 
				+
			
 
				+const char * __attribute__ ((aligned (16))) SOCL_PROFILE = "FULL_PROFILE";
			
 
				 const char * __attribute__ ((aligned (16))) SOCL_VERSION = "OpenCL 1.0 StarPU Edition (0.0.1)";
			
 
				 const char * __attribute__ ((aligned (16))) SOCL_PLATFORM_NAME    = "StarPU Platform";
			
 
				 const char * __attribute__ ((aligned (16))) SOCL_VENDOR  = "INRIA";
			
 
				-const char * __attribute__ ((aligned (16))) SOCL_PLATFORM_EXTENSIONS = "";
			
 
				+const char * __attribute__ ((aligned (16))) SOCL_PLATFORM_EXTENSIONS = "cl_khr_icd";
			
 
				+const char * __attribute__ ((aligned (16))) SOCL_PLATFORM_ICD_SUFFIX_KHR ="SOCL";
			
 
				 
			
 
				 
			
 
				 /* Command queues with profiling enabled
			
--- a/socl/src/socl.h
+++ b/socl/src/socl.h
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010,2011 University of Bordeaux
			
 
				+ * Copyright (C) 2010-2012 University of Bordeaux
			
 
				+ * Copyright (C) 2012 CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -17,20 +18,13 @@
 
				 #ifndef SOCL_H
			
 
				 #define SOCL_H
			
 
				 
			
 
				-#ifndef CL_HEADERS
			
 
				-#include "CL/cl.h"
			
 
				-#else
			
 
				-#include CL_HEADERS "CL/cl.h"
			
 
				-#endif
			
 
				-
			
 
				-/* Additional command type */
			
 
				-#define CL_COMMAND_BARRIER 0x99987
			
 
				-
			
 
				 #include <string.h>
			
 
				 #include <stdlib.h>
			
 
				 #include <stdint.h>
			
 
				 #include <unistd.h>
			
 
				 #include <pthread.h>
			
 
				+#include "CL/cl.h"
			
 
				+#include "ocl_icd.h"
			
 
				 
			
 
				 #include <starpu.h>
			
 
				 #include <starpu_opencl.h>
			
@@ -66,6 +60,7 @@ typedef struct entity * entity;
 
				 
			
 
				 
			
 
				 struct entity {
			
 
				+  struct _cl_icd_dispatch * dispatch;
			
 
				   /* Reference count */
			
 
				   size_t refs;
			
 
				 
			
@@ -81,7 +76,8 @@ struct entity {
 
				  * this macro as their first field */
			
 
				 #define CL_ENTITY struct entity _entity;
			
 
				 
			
 
				-struct _cl_platform_id {};
			
 
				+
			
 
				+struct _cl_platform_id {struct _cl_icd_dispatch *dispatch;};
			
 
				 
			
 
				 #define RETURN_EVENT(cmd, event) \
			
 
				 	if (event != NULL) { \
			
@@ -104,17 +100,16 @@ struct _cl_platform_id {};
 
				 
			
 
				 #define MAY_BLOCK_CUSTOM(blocking,event) \
			
 
				 	if ((blocking) == CL_TRUE) {\
			
 
				-		cl_event ev = (event);\
			
 
				-		soclWaitForEvents(1, &ev);\
			
 
				+		soclWaitForEvents(1, &(event));\
			
 
				 	}
			
 
				 
			
 
				 /* Constants */
			
 
				-struct _cl_platform_id socl_platform;
			
 
				 const char * SOCL_PROFILE;
			
 
				 const char * SOCL_VERSION;
			
 
				 const char * SOCL_PLATFORM_NAME;
			
 
				 const char * SOCL_VENDOR;
			
 
				 const char * SOCL_PLATFORM_EXTENSIONS;
			
 
				+const char * SOCL_PLATFORM_ICD_SUFFIX_KHR;
			
 
				 
			
 
				 struct _cl_context {
			
 
				   CL_ENTITY;
			
@@ -746,4 +741,12 @@ soclEnqueueBarrier(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_
 
				 extern CL_API_ENTRY void * CL_API_CALL
			
 
				 soclGetExtensionFunctionAddress(const char * /* func_name */) CL_API_SUFFIX__VERSION_1_0;
			
 
				 
			
 
				+extern CL_API_ENTRY cl_int CL_API_CALL
			
 
				+soclIcdGetPlatformIDsKHR(cl_uint          /* num_entries */,
			
 
				+                 cl_platform_id * /* platforms */,
			
 
				+                 cl_uint *        /* num_platforms */) CL_EXT_SUFFIX__VERSION_1_0;
			
 
				+
			
 
				+
			
 
				+struct _cl_icd_dispatch socl_master_dispatch;
			
 
				+struct _cl_platform_id socl_platform;
			
 
				 #endif /* SOCL_H */
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -58,7 +58,6 @@ noinst_HEADERS = 						\
 
				 	core/dependencies/data_concurrency.h			\
			
 
				 	core/dependencies/cg.h					\
			
 
				 	core/dependencies/tags.h				\
			
 
				-	core/dependencies/htable.h				\
			
 
				 	core/dependencies/implicit_data_deps.h			\
			
 
				 	core/sched_policy.h					\
			
 
				 	core/sched_ctx.h					\
			
@@ -91,8 +90,6 @@ noinst_HEADERS = 						\
 
				 	datawizard/interfaces/data_interface.h			\
			
 
				 	common/barrier.h					\
			
 
				 	common/timing.h						\
			
 
				-	common/htable32.h					\
			
 
				-	common/htable64.h					\
			
 
				 	common/list.h						\
			
 
				 	common/rwlock.h						\
			
 
				 	common/starpu_spinlock.h				\
			
@@ -123,8 +120,6 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 
				 	common/barrier.c					\
			
 
				 	common/barrier_counter.c				\
			
 
				 	common/hash.c 						\
			
 
				-	common/htable32.c					\
			
 
				-	common/htable64.c					\
			
 
				 	common/rwlock.c						\
			
 
				 	common/starpu_spinlock.c				\
			
 
				 	common/timing.c						\
			
@@ -144,7 +139,6 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 
				 	core/dependencies/implicit_data_deps.c			\
			
 
				 	core/dependencies/tags.c				\
			
 
				 	core/dependencies/task_deps.c				\
			
 
				-	core/dependencies/htable.c				\
			
 
				 	core/dependencies/data_concurrency.c			\
			
 
				 	core/perfmodel/perfmodel_history.c			\
			
 
				 	core/perfmodel/perfmodel_bus.c				\
			
--- a/src/common/htable32.c
+++ b/src/common/htable32.c
@@ -1,130 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009-2010, 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-#include <common/config.h>
			
 
				-#include <common/htable32.h>
			
 
				-#include <stdint.h>
			
 
				-#include <string.h>
			
 
				-
			
 
				-void *_starpu_htbl_search_32(struct starpu_htbl32_node *htbl, uint32_t key)
			
 
				-{
			
 
				-	unsigned currentbit;
			
 
				-	unsigned keysize = sizeof(uint32_t)*8;
			
 
				-
			
 
				-	struct starpu_htbl32_node *current_htbl = htbl;
			
 
				-
			
 
				-	/* 000000000001111 with HTBL_NODE_SIZE 1's */
			
 
				-	uint32_t mask = (1<<_STARPU_HTBL32_NODE_SIZE)-1;
			
 
				-
			
 
				-	for(currentbit = 0; currentbit < keysize; currentbit+=_STARPU_HTBL32_NODE_SIZE)
			
 
				-	{
			
 
				-		//	printf("search : current bit = %d \n", currentbit);
			
 
				-		if (STARPU_UNLIKELY(current_htbl == NULL))
			
 
				-			return NULL;
			
 
				-
			
 
				-		/* 0000000000001111
			
 
				-		 *     | currentbit
			
 
				-		 * 0000111100000000 = offloaded_mask
			
 
				-		 *         |last_currentbit
			
 
				-		 * */
			
 
				-
			
 
				-		unsigned last_currentbit =
			
 
				-			keysize - (currentbit + _STARPU_HTBL32_NODE_SIZE);
			
 
				-		uint32_t offloaded_mask = mask << last_currentbit;
			
 
				-		unsigned current_index =
			
 
				-			(key & (offloaded_mask)) >> (last_currentbit);
			
 
				-
			
 
				-		current_htbl = current_htbl->children[current_index];
			
 
				-	}
			
 
				-
			
 
				-	return current_htbl;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * returns the previous value of the tag, or NULL else
			
 
				- */
			
 
				-
			
 
				-void *_starpu_htbl_insert_32(struct starpu_htbl32_node **htbl, uint32_t key, void *entry)
			
 
				-{
			
 
				-	unsigned currentbit;
			
 
				-	unsigned keysize = sizeof(uint32_t)*8;
			
 
				-
			
 
				-	struct starpu_htbl32_node **current_htbl_ptr = htbl;
			
 
				-
			
 
				-	/* 000000000001111 with HTBL_NODE_SIZE 1's */
			
 
				-	uint32_t mask = (1<<_STARPU_HTBL32_NODE_SIZE)-1;
			
 
				-
			
 
				-	for(currentbit = 0; currentbit < keysize; currentbit+=_STARPU_HTBL32_NODE_SIZE)
			
 
				-	{
			
 
				-		//printf("insert : current bit = %d \n", currentbit);
			
 
				-		if (*current_htbl_ptr == NULL)
			
 
				-		{
			
 
				-			/* TODO pad to change that 1 into 16 ? */
			
 
				-			*current_htbl_ptr = (struct starpu_htbl32_node*)calloc(sizeof(struct starpu_htbl32_node), 1);
			
 
				-			STARPU_ASSERT(*current_htbl_ptr);
			
 
				-		}
			
 
				-
			
 
				-		/* 0000000000001111
			
 
				-		 *     | currentbit
			
 
				-		 * 0000111100000000 = offloaded_mask
			
 
				-		 *         |last_currentbit
			
 
				-		 * */
			
 
				-
			
 
				-		unsigned last_currentbit =
			
 
				-			keysize - (currentbit + _STARPU_HTBL32_NODE_SIZE);
			
 
				-		uint32_t offloaded_mask = mask << last_currentbit;
			
 
				-		unsigned current_index =
			
 
				-			(key & (offloaded_mask)) >> (last_currentbit);
			
 
				-
			
 
				-		current_htbl_ptr =
			
 
				-			&((*current_htbl_ptr)->children[current_index]);
			
 
				-	}
			
 
				-
			
 
				-	/* current_htbl either contains NULL or a previous entry
			
 
				-	 * we overwrite it anyway */
			
 
				-	void *old_entry = *current_htbl_ptr;
			
 
				-	*current_htbl_ptr = (struct starpu_htbl32_node *) entry;
			
 
				-
			
 
				-	return old_entry;
			
 
				-}
			
 
				-
			
 
				-static void _starpu_htbl_destroy_32_bit(struct starpu_htbl32_node *htbl, unsigned bit, void (*remove)(void*))
			
 
				-{
			
 
				-	unsigned keysize = sizeof(uint32_t)*8;
			
 
				-	unsigned i;
			
 
				-
			
 
				-	if (!htbl)
			
 
				-		return;
			
 
				-
			
 
				-	if (bit >= keysize) {
			
 
				-		/* entry, delete it */
			
 
				-		if (remove)
			
 
				-			remove(htbl);
			
 
				-		return;
			
 
				-	}
			
 
				-
			
 
				-	for (i = 0; i < 1<<_STARPU_HTBL32_NODE_SIZE; i++) {
			
 
				-		_starpu_htbl_destroy_32_bit(htbl->children[i], bit+_STARPU_HTBL32_NODE_SIZE, remove);
			
 
				-	}
			
 
				-
			
 
				-	free(htbl);
			
 
				-}
			
 
				-void _starpu_htbl_destroy_32(struct starpu_htbl32_node *htbl, void (*remove)(void*))
			
 
				-{
			
 
				-	_starpu_htbl_destroy_32_bit(htbl, 0, remove);
			
 
				-}
			
--- a/src/common/htable32.h
+++ b/src/common/htable32.h
@@ -1,47 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009-2010, 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#ifndef __GENERIC_HTABLE_H__
			
 
				-#define __GENERIC_HTABLE_H__
			
 
				-
			
 
				-#include <stdint.h>
			
 
				-#include <stdlib.h>
			
 
				-#include <stdio.h>
			
 
				-#include <assert.h>
			
 
				-
			
 
				-#define _STARPU_HTBL32_NODE_SIZE	8
			
 
				-
			
 
				-/* Hierarchical table: all nodes have a 2^8 arity . */
			
 
				-/* Note: this struct is used in include/starpu_perfmodel.h */
			
 
				-struct starpu_htbl32_node {
			
 
				-	unsigned nentries;
			
 
				-	struct starpu_htbl32_node *children[1<<_STARPU_HTBL32_NODE_SIZE];
			
 
				-};
			
 
				-
			
 
				-/* Look for a 32bit key into the hierchical table. Returns the entry if
			
 
				- * something is found, NULL otherwise. */
			
 
				-void *_starpu_htbl_search_32(struct starpu_htbl32_node *htbl, uint32_t key);
			
 
				-
			
 
				-/* Insert an entry indexed by the 32bit key into the hierarchical table.
			
 
				- * Returns the entry that was previously associated to that key if any, NULL
			
 
				- * otherwise. */
			
 
				-void *_starpu_htbl_insert_32(struct starpu_htbl32_node **htbl, uint32_t key, void *entry);
			
 
				-
			
 
				-/* Delete the content of the table, `remove' being called on each element */
			
 
				-void _starpu_htbl_destroy_32(struct starpu_htbl32_node *htbl, void (*remove)(void*));
			
 
				-
			
 
				-#endif // __GENERIC_HTABLE_H__
			
--- a/src/common/htable64.c
+++ b/src/common/htable64.c
@@ -1,106 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009-2010, 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <starpu.h>
			
 
				-#include <common/config.h>
			
 
				-#include <common/htable64.h>
			
 
				-#include <stdint.h>
			
 
				-#include <string.h>
			
 
				-
			
 
				-void *_starpu_htbl_search_64(struct starpu_htbl64_node *htbl, uint64_t key)
			
 
				-{
			
 
				-	unsigned currentbit;
			
 
				-	unsigned keysize = sizeof(uint64_t)*8;
			
 
				-
			
 
				-	struct starpu_htbl64_node *current_htbl = htbl;
			
 
				-	uint64_t mask = (1ULL<<_STARPU_HTBL64_NODE_SIZE)-1;
			
 
				-
			
 
				-	for(currentbit = 0; currentbit < keysize; currentbit+=_STARPU_HTBL64_NODE_SIZE)
			
 
				-	{
			
 
				-		if (STARPU_UNLIKELY(current_htbl == NULL))
			
 
				-			return NULL;
			
 
				-
			
 
				-		unsigned last_currentbit =
			
 
				-			keysize - (currentbit + _STARPU_HTBL64_NODE_SIZE);
			
 
				-		uint64_t offloaded_mask = mask << last_currentbit;
			
 
				-		unsigned current_index =
			
 
				-			(key & (offloaded_mask)) >> (last_currentbit);
			
 
				-
			
 
				-		current_htbl = current_htbl->children[current_index];
			
 
				-	}
			
 
				-	return current_htbl;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * returns the previous value of the tag, or NULL else
			
 
				- */
			
 
				-
			
 
				-void *_starpu_htbl_insert_64(struct starpu_htbl64_node **htbl, uint64_t key, void *entry)
			
 
				-{
			
 
				-	unsigned currentbit;
			
 
				-	unsigned keysize = sizeof(uint64_t)*8;
			
 
				-	struct starpu_htbl64_node **current_htbl_ptr = htbl;
			
 
				-
			
 
				-	uint64_t mask = (1ULL<<_STARPU_HTBL64_NODE_SIZE)-1;
			
 
				-	for(currentbit = 0; currentbit < keysize; currentbit+=_STARPU_HTBL64_NODE_SIZE)
			
 
				-	{
			
 
				-		if (*current_htbl_ptr == NULL)
			
 
				-		{
			
 
				-			*current_htbl_ptr = (struct starpu_htbl64_node*)calloc(sizeof(struct starpu_htbl64_node), 1);
			
 
				-			STARPU_ASSERT(*current_htbl_ptr);
			
 
				-		}
			
 
				-
			
 
				-		unsigned last_currentbit =
			
 
				-			keysize - (currentbit + _STARPU_HTBL64_NODE_SIZE);
			
 
				-		uint64_t offloaded_mask = mask << last_currentbit;
			
 
				-		unsigned current_index =
			
 
				-			(key & (offloaded_mask)) >> (last_currentbit);
			
 
				-
			
 
				-		current_htbl_ptr =
			
 
				-			&((*current_htbl_ptr)->children[current_index]);
			
 
				-	}
			
 
				-	void *old_entry = *current_htbl_ptr;
			
 
				-	*current_htbl_ptr = (struct starpu_htbl64_node *) entry;
			
 
				-
			
 
				-	return old_entry;
			
 
				-}
			
 
				-
			
 
				-static void _starpu_htbl_destroy_64_bit(struct starpu_htbl64_node *htbl, unsigned bit, void (*remove)(void*))
			
 
				-{
			
 
				-	unsigned keysize = sizeof(uint64_t)*8;
			
 
				-	unsigned i;
			
 
				-
			
 
				-	if (!htbl)
			
 
				-		return;
			
 
				-
			
 
				-	if (bit >= keysize) {
			
 
				-		/* entry, delete it */
			
 
				-		if (remove)
			
 
				-			remove(htbl);
			
 
				-		return;
			
 
				-	}
			
 
				-
			
 
				-	for (i = 0; i < 1ULL<<_STARPU_HTBL64_NODE_SIZE; i++) {
			
 
				-		_starpu_htbl_destroy_64_bit(htbl->children[i], bit+_STARPU_HTBL64_NODE_SIZE, remove);
			
 
				-	}
			
 
				-
			
 
				-	free(htbl);
			
 
				-}
			
 
				-void _starpu_htbl_destroy_64(struct starpu_htbl64_node *htbl, void (*remove)(void*))
			
 
				-{
			
 
				-	_starpu_htbl_destroy_64_bit(htbl, 0, remove);
			
 
				-}
			
--- a/src/common/htable64.h
+++ b/src/common/htable64.h
@@ -1,46 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009-2010, 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#ifndef __GENERIC_HTABLE_H__
			
 
				-#define __GENERIC_HTABLE_H__
			
 
				-
			
 
				-#include <stdint.h>
			
 
				-#include <stdlib.h>
			
 
				-#include <stdio.h>
			
 
				-#include <assert.h>
			
 
				-
			
 
				-#define _STARPU_HTBL64_NODE_SIZE	8 
			
 
				-
			
 
				-/* Hierarchical table: all nodes have a 2^8 arity . */
			
 
				-struct starpu_htbl64_node {
			
 
				-	unsigned nentries;
			
 
				-	struct starpu_htbl64_node *children[1ULL<<_STARPU_HTBL64_NODE_SIZE];
			
 
				-};
			
 
				-
			
 
				-/* Look for a 64bit key into the hierchical table. Returns the entry if
			
 
				- * something is found, NULL otherwise. */
			
 
				-void *_starpu_htbl_search_64(struct starpu_htbl64_node *htbl, uint64_t key);
			
 
				-
			
 
				-/* Insert an entry indexed by the 64bit key into the hierarchical table.
			
 
				- * Returns the entry that was previously associated to that key if any, NULL
			
 
				- * otherwise. */
			
 
				-void *_starpu_htbl_insert_64(struct starpu_htbl64_node **htbl, uint64_t key, void *entry);
			
 
				-
			
 
				-/* Delete the content of the table, `remove' being called on each element */
			
 
				-void _starpu_htbl_destroy_64(struct starpu_htbl64_node *htbl, void (*remove)(void*));
			
 
				-
			
 
				-#endif // __GENERIC_HTABLE_H__
			
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -30,7 +30,7 @@
 
				 #ifdef STARPU_VERBOSE
			
 
				 #  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__func__ ,##args); fflush(stderr); }} while(0)
			
 
				 #else
			
 
				-#  define _STARPU_DEBUG(fmt, args ...)
			
 
				+#  define _STARPU_DEBUG(fmt, args ...) do { } while (0)
			
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_VERBOSE0
			
--- a/src/core/dependencies/dependencies.c
+++ b/src/core/dependencies/dependencies.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -19,7 +19,6 @@
 
				 #include <common/config.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <core/dependencies/tags.h>
			
 
				-#include <core/dependencies/htable.h>
			
 
				 #include <core/jobs.h>
			
 
				 #include <core/sched_policy.h>
			
 
				 #include <core/dependencies/data_concurrency.h>
			
--- a/src/core/dependencies/htable.c
+++ b/src/core/dependencies/htable.c
@@ -1,197 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <core/dependencies/htable.h>
			
 
				-#include <string.h>
			
 
				-
			
 
				-void *_starpu_htbl_search_tag(struct _starpu_htbl_node *htbl, starpu_tag_t tag)
			
 
				-{
			
 
				-	unsigned currentbit;
			
 
				-	struct _starpu_htbl_node *current_htbl = htbl;
			
 
				-
			
 
				-	/* 000000000001111 with _STARPU_HTBL_NODE_SIZE 1's */
			
 
				-	starpu_tag_t mask = (1<<_STARPU_HTBL_NODE_SIZE)-1;
			
 
				-
			
 
				-	for(currentbit = 0; currentbit < _STARPU_TAG_SIZE; currentbit+=_STARPU_HTBL_NODE_SIZE)
			
 
				-	{
			
 
				-	//	printf("search : current bit = %d \n", currentbit);
			
 
				-		if (STARPU_UNLIKELY(current_htbl == NULL))
			
 
				-			return NULL;
			
 
				-
			
 
				-		/* 0000000000001111
			
 
				-		 *     | currentbit
			
 
				-		 * 0000111100000000 = offloaded_mask
			
 
				-		 *         |last_currentbit
			
 
				-		 * */
			
 
				-
			
 
				-		unsigned last_currentbit =
			
 
				-			_STARPU_TAG_SIZE - (currentbit + _STARPU_HTBL_NODE_SIZE);
			
 
				-		starpu_tag_t offloaded_mask = mask << last_currentbit;
			
 
				-		unsigned current_index =
			
 
				-			(tag & (offloaded_mask)) >> (last_currentbit);
			
 
				-
			
 
				-		current_htbl = current_htbl->children[current_index];
			
 
				-	}
			
 
				-
			
 
				-	return current_htbl;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * returns the previous value of the tag, or NULL else
			
 
				- */
			
 
				-
			
 
				-void *_starpu_htbl_insert_tag(struct _starpu_htbl_node **htbl, starpu_tag_t tag, void *entry)
			
 
				-{
			
 
				-	unsigned currentbit;
			
 
				-	struct _starpu_htbl_node **current_htbl_ptr = htbl;
			
 
				-	struct _starpu_htbl_node *previous_htbl_ptr = NULL;
			
 
				-
			
 
				-	/* 000000000001111 with _STARPU_HTBL_NODE_SIZE 1's */
			
 
				-	starpu_tag_t mask = (1<<_STARPU_HTBL_NODE_SIZE)-1;
			
 
				-
			
 
				-	for(currentbit = 0; currentbit < _STARPU_TAG_SIZE; currentbit+=_STARPU_HTBL_NODE_SIZE)
			
 
				-	{
			
 
				-		if (*current_htbl_ptr == NULL)
			
 
				-		{
			
 
				-			/* TODO pad to change that 1 into 16 ? */
			
 
				-			*current_htbl_ptr = (struct _starpu_htbl_node *) calloc(1, sizeof(struct _starpu_htbl_node));
			
 
				-			STARPU_ASSERT(*current_htbl_ptr);
			
 
				-
			
 
				-			if (previous_htbl_ptr)
			
 
				-				previous_htbl_ptr->nentries++;
			
 
				-		}
			
 
				-
			
 
				-		/* 0000000000001111
			
 
				-		 *     | currentbit
			
 
				-		 * 0000111100000000 = offloaded_mask
			
 
				-		 *         |last_currentbit
			
 
				-		 * */
			
 
				-
			
 
				-		unsigned last_currentbit =
			
 
				-			_STARPU_TAG_SIZE - (currentbit + _STARPU_HTBL_NODE_SIZE);
			
 
				-		starpu_tag_t offloaded_mask = mask << last_currentbit;
			
 
				-		unsigned current_index =
			
 
				-			(tag & (offloaded_mask)) >> (last_currentbit);
			
 
				-
			
 
				-		previous_htbl_ptr = *current_htbl_ptr;
			
 
				-		current_htbl_ptr =
			
 
				-			&((*current_htbl_ptr)->children[current_index]);
			
 
				-	}
			
 
				-
			
 
				-	/* current_htbl either contains NULL or a previous entry
			
 
				-	 * we overwrite it anyway */
			
 
				-	void *old_entry = *current_htbl_ptr;
			
 
				-	*current_htbl_ptr = (struct _starpu_htbl_node *) entry;
			
 
				-
			
 
				-	if (!old_entry)
			
 
				-		previous_htbl_ptr->nentries++;
			
 
				-
			
 
				-	return old_entry;
			
 
				-}
			
 
				-
			
 
				-/* returns the entry corresponding to the tag and remove it from the htbl */
			
 
				-void *_starpu_htbl_remove_tag(struct _starpu_htbl_node **htbl, starpu_tag_t tag)
			
 
				-{
			
 
				-	/* NB : if the entry is "NULL", we assume this means it is not present XXX */
			
 
				-	unsigned currentbit;
			
 
				-	struct _starpu_htbl_node **current_htbl_ptr_parent = htbl;
			
 
				-	struct _starpu_htbl_node *current_htbl_ptr = *current_htbl_ptr_parent;
			
 
				-
			
 
				-	/* remember the path to the tag */
			
 
				-	struct _starpu_htbl_node *path[(_STARPU_TAG_SIZE + _STARPU_HTBL_NODE_SIZE - 1)/(_STARPU_HTBL_NODE_SIZE)];
			
 
				-	struct _starpu_htbl_node **path_parent[(_STARPU_TAG_SIZE + _STARPU_HTBL_NODE_SIZE - 1)/(_STARPU_HTBL_NODE_SIZE)];
			
 
				-
			
 
				-	/* 000000000001111 with _STARPU_HTBL_NODE_SIZE 1's */
			
 
				-	starpu_tag_t mask = (1<<_STARPU_HTBL_NODE_SIZE)-1;
			
 
				-	int level, maxlevel;
			
 
				-	unsigned tag_is_present = 1;
			
 
				-
			
 
				-	for(currentbit = 0, level = 0; currentbit < _STARPU_TAG_SIZE; currentbit+=_STARPU_HTBL_NODE_SIZE, level++)
			
 
				-	{
			
 
				-		path_parent[level] = current_htbl_ptr_parent;
			
 
				-		path[level] = current_htbl_ptr;
			
 
				-
			
 
				-		if (STARPU_UNLIKELY(!current_htbl_ptr))
			
 
				-		{
			
 
				-			tag_is_present = 0;
			
 
				-			break;
			
 
				-		}
			
 
				-
			
 
				-		/* 0000000000001111
			
 
				-		 *     | currentbit
			
 
				-		 * 0000111100000000 = offloaded_mask
			
 
				-		 *         |last_currentbit
			
 
				-		 * */
			
 
				-
			
 
				-		unsigned last_currentbit =
			
 
				-			_STARPU_TAG_SIZE - (currentbit + _STARPU_HTBL_NODE_SIZE);
			
 
				-		starpu_tag_t offloaded_mask = mask << last_currentbit;
			
 
				-		unsigned current_index =
			
 
				-			(tag & (offloaded_mask)) >> (last_currentbit);
			
 
				-
			
 
				-		current_htbl_ptr_parent = 
			
 
				-			&current_htbl_ptr->children[current_index];
			
 
				-		current_htbl_ptr = *current_htbl_ptr_parent;
			
 
				-	}
			
 
				-
			
 
				-	maxlevel = level;
			
 
				-	if (STARPU_UNLIKELY(!current_htbl_ptr))
			
 
				-		tag_is_present = 0;
			
 
				-
			
 
				-	void *old_entry = current_htbl_ptr;
			
 
				-
			
 
				-	if (tag_is_present)
			
 
				-	{
			
 
				-		/* the tag was in the htbl, so we have to unroll the search
			
 
				- 		 * to remove possibly useless htbl (internal) nodes */
			
 
				-		for (level = maxlevel - 1; level >= 0; level--)
			
 
				-		{
			
 
				-			path[level]->nentries--;
			
 
				-
			
 
				-			/* TODO use likely statements ... */
			
 
				-
			
 
				-			/* in case we do not remove that node, we do decrease its parents
			
 
				- 			 * number of entries */
			
 
				-			if (path[level]->nentries > 0)
			
 
				-				break;
			
 
				-
			
 
				-			/* we remove this node */
			
 
				-			free(path[level]);
			
 
				-			*(path_parent[level]) = NULL;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	/* we return the entry if there was one */
			
 
				-	return old_entry;
			
 
				-}
			
 
				-
			
 
				-void _starpu_htbl_clear_tags(struct _starpu_htbl_node **htbl, unsigned level, void (*free_entry)(void *))
			
 
				-{
			
 
				-	unsigned i;
			
 
				-	struct _starpu_htbl_node *tbl = *htbl;
			
 
				-
			
 
				-	if (!tbl)
			
 
				-		return;
			
 
				-
			
 
				-	if (level * _STARPU_HTBL_NODE_SIZE < _STARPU_TAG_SIZE) {
			
 
				-		for (i = 0; i < 1<<_STARPU_HTBL_NODE_SIZE; i++)
			
 
				-			_starpu_htbl_clear_tags(&tbl->children[i], level + 1, free_entry);
			
 
				-		free(tbl);
			
 
				-	} else
			
 
				-		free_entry(tbl);
			
 
				-	*htbl = NULL;
			
 
				-}
			
--- a/src/core/dependencies/htable.h
+++ b/src/core/dependencies/htable.h
@@ -1,45 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#ifndef __HTABLE_H__
			
 
				-#define __HTABLE_H__
			
 
				-
			
 
				-/*
			
 
				- *	Define a hierarchical table to do the tag matching
			
 
				- */
			
 
				-
			
 
				-#include <stdint.h>
			
 
				-#include <stdlib.h>
			
 
				-#include <stdio.h>
			
 
				-#include <assert.h>
			
 
				-#include <core/dependencies/tags.h>
			
 
				-
			
 
				-#define _STARPU_HTBL_NODE_SIZE	8
			
 
				-
			
 
				-struct _starpu_htbl_node
			
 
				-{
			
 
				-	unsigned nentries;
			
 
				-	struct _starpu_htbl_node *children[1<<_STARPU_HTBL_NODE_SIZE];
			
 
				-};
			
 
				-
			
 
				-void *_starpu_htbl_search_tag(struct _starpu_htbl_node *htbl, starpu_tag_t tag);
			
 
				-void *_starpu_htbl_insert_tag(struct _starpu_htbl_node **htbl, starpu_tag_t tag, void *entry);
			
 
				-void *_starpu_htbl_remove_tag(struct _starpu_htbl_node **htbl, starpu_tag_t tag);
			
 
				-void _starpu_htbl_clear_tags(struct _starpu_htbl_node **htbl, unsigned level, void (*free_entry)(void*));
			
 
				-
			
 
				-
			
 
				-#endif
			
--- a/src/core/dependencies/tags.c
+++ b/src/core/dependencies/tags.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -19,13 +19,23 @@
 
				 #include <common/config.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <core/dependencies/tags.h>
			
 
				-#include <core/dependencies/htable.h>
			
 
				 #include <core/jobs.h>
			
 
				 #include <core/sched_policy.h>
			
 
				 #include <core/dependencies/data_concurrency.h>
			
 
				 #include <profiling/bound.h>
			
 
				+#include <common/uthash.h>
			
 
				 
			
 
				-static struct _starpu_htbl_node *tag_htbl = NULL;
			
 
				+struct _starpu_tag_table
			
 
				+{
			
 
				+	UT_hash_handle hh;
			
 
				+	starpu_tag_t id;
			
 
				+	struct _starpu_tag *tag;
			
 
				+};
			
 
				+
			
 
				+#define HASH_ADD_UINT64_T(head,field,add) HASH_ADD(hh,head,field,sizeof(uint64_t),add)
			
 
				+#define HASH_FIND_UINT64_T(head,find,out) HASH_FIND(hh,head,find,sizeof(uint64_t),out)
			
 
				+
			
 
				+static struct _starpu_tag_table *tag_htbl = NULL;
			
 
				 static pthread_rwlock_t tag_global_rwlock = PTHREAD_RWLOCK_INITIALIZER;
			
 
				 
			
 
				 static struct _starpu_cg *create_cg_apps(unsigned ntags)
			
@@ -114,15 +124,16 @@ static void _starpu_tag_free(void *_tag)
 
				 
			
 
				 void starpu_tag_remove(starpu_tag_t id)
			
 
				 {
			
 
				-	struct _starpu_tag *tag;
			
 
				+	struct _starpu_tag_table *entry;
			
 
				 
			
 
				 	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
			
 
				 
			
 
				-	tag = (struct _starpu_tag *) _starpu_htbl_remove_tag(&tag_htbl, id);
			
 
				+	HASH_FIND_UINT64_T(tag_htbl, &id, entry);
			
 
				+	if (entry) HASH_DEL(tag_htbl, entry);
			
 
				 
			
 
				 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
			
 
				 
			
 
				-	_starpu_tag_free(tag);
			
 
				+	if (entry)_starpu_tag_free(entry->tag);
			
 
				 }
			
 
				 
			
 
				 void _starpu_tag_clear(void)
			
@@ -133,7 +144,13 @@ void _starpu_tag_clear(void)
 
				 	 * the global rwlock. This contradicts the lock order of
			
 
				 	 * starpu_tag_wait_array. Should not be a problem in practice since
			
 
				 	 * _starpu_tag_clear is called at shutdown only. */
			
 
				-	_starpu_htbl_clear_tags(&tag_htbl, 0, _starpu_tag_free);
			
 
				+	struct _starpu_tag_table *entry, *tmp;
			
 
				+
			
 
				+	HASH_ITER(hh, tag_htbl, entry, tmp)
			
 
				+	{
			
 
				+		HASH_DEL(tag_htbl, entry);
			
 
				+		_starpu_tag_free(entry->tag);
			
 
				+	}
			
 
				 
			
 
				 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
			
 
				 }
			
@@ -143,18 +160,24 @@ static struct _starpu_tag *gettag_struct(starpu_tag_t id)
 
				 	_STARPU_PTHREAD_RWLOCK_WRLOCK(&tag_global_rwlock);
			
 
				 
			
 
				 	/* search if the tag is already declared or not */
			
 
				+	struct _starpu_tag_table *entry;
			
 
				 	struct _starpu_tag *tag;
			
 
				-	tag = (struct _starpu_tag *) _starpu_htbl_search_tag(tag_htbl, id);
			
 
				 
			
 
				-	if (tag == NULL)
			
 
				+	HASH_FIND_UINT64_T(tag_htbl, &id, entry);
			
 
				+	if (entry != NULL)
			
 
				+	     tag = entry->tag;
			
 
				+	else
			
 
				 	{
			
 
				 		/* the tag does not exist yet : create an entry */
			
 
				 		tag = _starpu_tag_init(id);
			
 
				 
			
 
				-		void *old;
			
 
				-		old = _starpu_htbl_insert_tag(&tag_htbl, id, tag);
			
 
				-		/* there was no such tag before */
			
 
				-		STARPU_ASSERT(old == NULL);
			
 
				+		struct _starpu_tag_table *entry2;
			
 
				+		entry2 = (struct _starpu_tag_table *) malloc(sizeof(*entry2));
			
 
				+		STARPU_ASSERT(entry2 != NULL);
			
 
				+		entry2->id = id;
			
 
				+		entry2->tag = tag;
			
 
				+
			
 
				+		HASH_ADD_UINT64_T(tag_htbl, id, entry2);
			
 
				 	}
			
 
				 
			
 
				 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&tag_global_rwlock);
			
--- a/src/core/dependencies/task_deps.c
+++ b/src/core/dependencies/task_deps.c
@@ -19,7 +19,6 @@
 
				 #include <common/config.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <core/dependencies/tags.h>
			
 
				-#include <core/dependencies/htable.h>
			
 
				 #include <core/jobs.h>
			
 
				 #include <core/task.h>
			
 
				 #include <core/sched_policy.h>
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -22,7 +22,6 @@
 
				 #include <common/config.h>
			
 
				 #include <starpu.h>
			
 
				 #include <starpu_perfmodel.h>
			
 
				-#include <common/htable32.h>
			
 
				 #include <core/task_bundle.h>
			
 
				 #include <pthread.h>
			
 
				 #include <stdio.h>
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -44,7 +44,7 @@
 
				 #define SIZE	(32*1024*1024*sizeof(char))
			
 
				 #define NITER	128
			
 
				 
			
 
				-#define MAXCPUS	32
			
 
				+static void starpu_force_bus_sampling(void);
			
 
				 
			
 
				 /* timing is in Âµs per byte (i.e. slowness, inverse of bandwidth) */
			
 
				 struct dev_timing
			
@@ -64,19 +64,19 @@ static int nopencl = 0;
 
				 /* Benchmarking the performance of the bus */
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
			
 
				+static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXCPUS];
			
 
				 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
			
 
				 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
			
 
				 #ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				 static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
			
 
				 #endif
			
 
				-static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
			
 
				+static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
			
 
				+static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXCPUS];
			
 
				 static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
			
 
				 static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
			
 
				-static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
			
 
				+static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
			
 
				 #endif
			
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
@@ -86,6 +86,7 @@ static hwloc_topology_t hwtopology;
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+
			
 
				 static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
			
 
				 {
			
 
				 	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
@@ -93,11 +94,11 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	size_t size = SIZE;
			
 
				 
			
 
				 	/* Initialize CUDA context on the device */
			
 
				-	starpu_cuda_set_device(dev);
			
 
				+	cudaSetDevice(dev);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				-
			
 
				+	
			
 
				 	/* hack to force the initialization */
			
 
				 	cudaFree(0);
			
 
				 
			
@@ -149,7 +150,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	gettimeofday(&end, NULL);
			
 
				 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
			
 
				+	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
			
 
				 
			
 
				 	/* Measure download bandwidth */
			
 
				 	gettimeofday(&start, NULL);
			
@@ -161,7 +162,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 	gettimeofday(&end, NULL);
			
 
				 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
			
 
				+	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
			
 
				 
			
 
				 	/* Free buffers */
			
 
				 	cudaFreeHost(h_buffer);
			
@@ -174,10 +175,12 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 
				 static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
			
 
				 {
			
 
				 	size_t size = SIZE;
			
 
				+	int can;
			
 
				 
			
 
				         /* Get the maximum size which can be allocated on the device */
			
 
				 	struct cudaDeviceProp prop;
			
 
				 	cudaError_t cures;
			
 
				+
			
 
				 	cures = cudaGetDeviceProperties(&prop, src);
			
 
				 	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
			
@@ -186,7 +189,14 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				         if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
			
 
				 
			
 
				 	/* Initialize CUDA context on the source */
			
 
				-	starpu_cuda_set_device(src);
			
 
				+	cudaSetDevice(src);
			
 
				+
			
 
				+	cures = cudaDeviceCanAccessPeer(&can, src, dst);
			
 
				+	if (!cures && can) {
			
 
				+		cures = cudaDeviceEnablePeerAccess(dst, 0);
			
 
				+		if (!cures)
			
 
				+			_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
			
 
				+	}
			
 
				 
			
 
				 	/* Allocate a buffer on the device */
			
 
				 	unsigned char *s_buffer;
			
@@ -195,7 +205,14 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	cudaMemset(s_buffer, 0, size);
			
 
				 
			
 
				 	/* Initialize CUDA context on the destination */
			
 
				-	starpu_cuda_set_device(dst);
			
 
				+	cudaSetDevice(dst);
			
 
				+
			
 
				+	cures = cudaDeviceCanAccessPeer(&can, dst, src);
			
 
				+	if (!cures && can) {
			
 
				+		cures = cudaDeviceEnablePeerAccess(src, 0);
			
 
				+		if (!cures)
			
 
				+			_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
			
 
				+	}
			
 
				 
			
 
				 	/* Allocate a buffer on the device */
			
 
				 	unsigned char *d_buffer;
			
@@ -222,7 +239,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 
			
 
				 	/* Free buffers */
			
 
				 	cudaFree(d_buffer);
			
 
				-	starpu_cuda_set_device(src);
			
 
				+	cudaSetDevice(src);
			
 
				 	cudaFree(s_buffer);
			
 
				 
			
 
				 	cudaThreadExit();
			
@@ -237,12 +254,18 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 
				         cl_command_queue queue;
			
 
				         cl_int err=0;
			
 
				 	size_t size = SIZE;
			
 
				+	int not_initialized;
			
 
				 
			
 
				         struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
 
				 
			
 
				-	/* Initialize OpenCL context on the device */
			
 
				-        _starpu_opencl_init_context(dev);
			
 
				+	/* Is the context already initialised ? */
			
 
				+        starpu_opencl_get_context(dev, &context);
			
 
				+	not_initialized = (context == NULL);
			
 
				+	if (not_initialized == 1)
			
 
				+	     _starpu_opencl_init_context(dev);
			
 
				+
			
 
				+	/* Get context and queue */
			
 
				         starpu_opencl_get_context(dev, &context);
			
 
				         starpu_opencl_get_queue(dev, &queue);
			
 
				 
			
@@ -300,7 +323,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 
				 	gettimeofday(&end, NULL);
			
 
				 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
			
 
				+	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
			
 
				 
			
 
				 	/* Measure download bandwidth */
			
 
				 	gettimeofday(&start, NULL);
			
@@ -312,14 +335,15 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 
				 	gettimeofday(&end, NULL);
			
 
				 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				 
			
 
				-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
			
 
				+	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
			
 
				 
			
 
				 	/* Free buffers */
			
 
				 	clReleaseMemObject(d_buffer);
			
 
				 	free(h_buffer);
			
 
				 
			
 
				 	/* Uninitiliaze OpenCL context on the device */
			
 
				-        _starpu_opencl_deinit_context(dev);
			
 
				+	if (not_initialized == 1)
			
 
				+	     _starpu_opencl_deinit_context(dev);
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -334,11 +358,11 @@ static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_
 
				 	double right_dtoh = right->timing_dtoh;
			
 
				 	double right_htod = right->timing_htod;
			
 
				 
			
 
				-	double bandwidth_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
			
 
				-	double bandwidth_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
			
 
				+	double timing_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
			
 
				+	double timing_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
			
 
				 
			
 
				 	/* it's for a decreasing sorting */
			
 
				-	return (bandwidth_sum2_left < bandwidth_sum2_right);
			
 
				+	return (timing_sum2_left > timing_sum2_right);
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
@@ -399,7 +423,7 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 
				 	unsigned cpu;
			
 
				 	for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 	{
			
 
				-		dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
			
 
				+		dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id = cpu;
			
 
				 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 		int numa_id = 0;
			
@@ -413,9 +437,9 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 
				 			if (is_available_per_numa_node[numa_id])
			
 
				 			{
			
 
				 				/* We reuse the previous numbers for that NUMA node */
			
 
				-				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod =
			
 
				+				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod =
			
 
				 					dev_timing_htod_per_numa_node[numa_id];
			
 
				-				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh =
			
 
				+				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh =
			
 
				 					dev_timing_dtoh_per_numa_node[numa_id];
			
 
				 				continue;
			
 
				 			}
			
@@ -436,9 +460,9 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 
				 		{
			
 
				 			/* Save the results for that NUMA node */
			
 
				 			dev_timing_htod_per_numa_node[numa_id] =
			
 
				-				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
			
 
				+				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
			
 
				 			dev_timing_dtoh_per_numa_node[numa_id] =
			
 
				-				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
			
 
				+				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
			
 
				 
			
 
				 			is_available_per_numa_node[numa_id] = 1;
			
 
				 		}
			
@@ -461,35 +485,32 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 
				 	measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);
			
 
				 
			
 
				 	/* sort the results */
			
 
				-	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
			
 
				+	qsort(&(dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS]), ncpus,
			
 
				               sizeof(struct dev_timing),
			
 
				 			compar_dev_timing);
			
 
				 
			
 
				-#ifdef STARPU_DEVEL
			
 
				-#  warning save timing_dtoh and timing_htod data to display them when calling starpu_machine_display ? (Brice would like that)
			
 
				-#endif
			
 
				 #ifdef STARPU_VERBOSE
			
 
				         unsigned cpu;
			
 
				 	for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 	{
			
 
				-		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id;
			
 
				-		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
			
 
				-		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
			
 
				+		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id;
			
 
				+		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
			
 
				+		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
			
 
				 
			
 
				 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
			
 
				 
			
 
				 		_STARPU_DISP("(%10s) BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", type, dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
			
 
				 	}
			
 
				 
			
 
				-	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
			
 
				+	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].cpu_id;
			
 
				 
			
 
				 	_STARPU_DISP("(%10s) BANDWIDTH GPU %d BEST CPU %u\n", type, dev, best_cpu);
			
 
				 #endif
			
 
				 
			
 
				 	/* The results are sorted in a decreasing order, so that the best
			
 
				 	 * measurement is currently the first entry. */
			
 
				-	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_dtoh;
			
 
				-	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_htod;
			
 
				+	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_dtoh;
			
 
				+	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_htod;
			
 
				 }
			
 
				 #endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
			
 
				 
			
@@ -670,7 +691,7 @@ static void load_bus_affinity_file_content(void)
 
				 #endif /* !STARPU_USE_OPENCL */
			
 
				 
			
 
				 	fclose(f);
			
 
				-#endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
			
 
				+#endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
			
 
				 
			
 
				 }
			
 
				 
			
@@ -706,7 +727,7 @@ static void write_bus_affinity_file_content(void)
 
				 
			
 
				 		for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 		{
			
 
				-			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
			
 
				+			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
			
 
				 		}
			
 
				 
			
 
				 		fprintf(f, "\n");
			
@@ -719,7 +740,7 @@ static void write_bus_affinity_file_content(void)
 
				 
			
 
				 		for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 		{
			
 
				-                        fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
			
 
				+                        fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
			
 
				 		}
			
 
				 
			
 
				 		fprintf(f, "\n");
			
@@ -774,14 +795,10 @@ void starpu_bus_print_affinity(FILE *f)
 
				 	unsigned cpu;
			
 
				 	int gpu;
			
 
				 
			
 
				-	fprintf(f, "# GPU\t");
			
 
				-	for(cpu = 0 ; cpu < ncpus ; cpu++)
			
 
				-	{
			
 
				-		fprintf(f, "CPU%u\t", cpu);
			
 
				-	}
			
 
				-	fprintf(f, "\n");
			
 
				+	fprintf(f, "# GPU\tCPU in preference order (logical index)\n");
			
 
				 
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				+	fprintf(f, "# CUDA\n");
			
 
				 	for(gpu = 0 ; gpu<ncuda ; gpu++)
			
 
				 	{
			
 
				 		fprintf(f, "%d\t", gpu);
			
@@ -793,6 +810,7 @@ void starpu_bus_print_affinity(FILE *f)
 
				 	}
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				+	fprintf(f, "# OpenCL\n");
			
 
				 	for(gpu = 0 ; gpu<nopencl ; gpu++)
			
 
				 	{
			
 
				 		fprintf(f, "%d\t", gpu);
			
@@ -1090,12 +1108,12 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				         maxnode += nopencl;
			
 
				 #endif
			
 
				 
			
 
				-	fprintf(f, "from\t");
			
 
				-	fprintf(f, "to RAM\t\t");
			
 
				+	fprintf(f, "from to\t");
			
 
				+	fprintf(f, "RAM\t");
			
 
				 	for (dst = 0; dst < ncuda; dst++)
			
 
				-		fprintf(f, "to CUDA %d\t", dst);
			
 
				+		fprintf(f, "CUDA %d\t", dst);
			
 
				 	for (dst = 0; dst < nopencl; dst++)
			
 
				-		fprintf(f, "to OpenCL %d\t", dst);
			
 
				+		fprintf(f, "OpenCL %d\t", dst);
			
 
				 	fprintf(f, "\n");
			
 
				 
			
 
				 	for (src = 0; src <= maxnode; src++)
			
@@ -1107,10 +1125,45 @@ void starpu_bus_print_bandwidth(FILE *f)
 
				 		else
			
 
				 			fprintf(f, "OpenCL%d\t", src-ncuda-1);
			
 
				 		for (dst = 0; dst <= maxnode; dst++)
			
 
				-			fprintf(f, "%f\t", bandwidth_matrix[src][dst]);
			
 
				+			fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
			
 
				 
			
 
				 		fprintf(f, "\n");
			
 
				 	}
			
 
				+
			
 
				+	fprintf(f, "\nGPU\tCPU in preference order (logical index), host-to-device, device-to-host\n");
			
 
				+	for (src = 1; src <= maxnode; src++)
			
 
				+	{
			
 
				+		struct dev_timing *timing;
			
 
				+		struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+		int ncpus = _starpu_topology_get_nhwcpu(config);
			
 
				+		int cpu;
			
 
				+
			
 
				+		if (src <= ncuda)
			
 
				+		{
			
 
				+			fprintf(f, "CUDA %d\t", src-1);
			
 
				+			for (cpu = 0; cpu < ncpus; cpu++)
			
 
				+			{
			
 
				+				timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
			
 
				+				if (timing->timing_htod)
			
 
				+					fprintf(f, "%d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
			
 
				+				else
			
 
				+					fprintf(f, "%d\t", cuda_affinity_matrix[src-1][cpu]);
			
 
				+			}
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			fprintf(f, "OpenCL%d\t", src-ncuda-1);
			
 
				+			for (cpu = 0; cpu < ncpus; cpu++)
			
 
				+			{
			
 
				+				timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
			
 
				+				if (timing->timing_htod)
			
 
				+					fprintf(f, "%d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
			
 
				+				else
			
 
				+					fprintf(f, "%d\t", opencl_affinity_matrix[src-1][cpu]);
			
 
				+			}
			
 
				+		}
			
 
				+		fprintf(f, "\n");
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static void generate_bus_bandwidth_file(void)
			
@@ -1148,21 +1201,23 @@ static void check_bus_config_file()
 
				 {
			
 
				         int res;
			
 
				         char path[256];
			
 
				+        struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				 
			
 
				         get_config_path(path, 256);
			
 
				         res = access(path, F_OK);
			
 
				-        if (res)
			
 
				+	if (res || config->conf->bus_calibrate > 0)
			
 
				 	{
			
 
				-		_STARPU_DISP("No performance model for the bus, calibrating...\n");
			
 
				+		if (res)
			
 
				+			_STARPU_DISP("No performance model for the bus, calibrating...\n");
			
 
				 		starpu_force_bus_sampling();
			
 
				-		_STARPU_DISP("... done\n");
			
 
				+		if (res)
			
 
				+			_STARPU_DISP("... done\n");
			
 
				         }
			
 
				         else
			
 
				 	{
			
 
				                 FILE *f;
			
 
				                 int ret, read_cuda = -1, read_opencl = -1;
			
 
				                 unsigned read_cpus = -1;
			
 
				-                struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				 
			
 
				                 // Loading configuration from file
			
 
				                 f = fopen(path, "r");
			
@@ -1240,7 +1295,7 @@ static void generate_bus_config_file()
 
				  *	Generic
			
 
				  */
			
 
				 
			
 
				-void starpu_force_bus_sampling(void)
			
 
				+static void starpu_force_bus_sampling(void)
			
 
				 {
			
 
				 	_starpu_create_sampling_directory_if_needed();
			
 
				 
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -30,11 +30,22 @@
 
				 #include <core/perfmodel/regression.h>
			
 
				 #include <common/config.h>
			
 
				 #include <starpu_parameters.h>
			
 
				+#include <common/uthash.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_WINDOWS
			
 
				 #include <windows.h>
			
 
				 #endif
			
 
				 
			
 
				+#define HASH_ADD_UINT32_T(head,field,add) HASH_ADD(hh,head,field,sizeof(uint32_t),add)
			
 
				+#define HASH_FIND_UINT32_T(head,find,out) HASH_FIND(hh,head,find,sizeof(uint32_t),out)
			
 
				+
			
 
				+struct starpu_history_table
			
 
				+{
			
 
				+	UT_hash_handle hh;
			
 
				+	uint32_t footprint;
			
 
				+	struct starpu_history_entry *history_entry;
			
 
				+};
			
 
				+
			
 
				 /* We want more than 10% variance on X to trust regression */
			
 
				 #define VALID_REGRESSION(reg_model) \
			
 
				 	((reg_model)->minx < (9*(reg_model)->maxx)/10 && (reg_model)->nsample >= _STARPU_CALIBRATION_MINIMUM)
			
@@ -45,19 +56,25 @@ static struct starpu_model_list *registered_models = NULL;
 
				 /*
			
 
				  * History based model
			
 
				  */
			
 
				-static void insert_history_entry(struct starpu_history_entry *entry, struct starpu_history_list **list, struct starpu_htbl32_node **history_ptr)
			
 
				+static void insert_history_entry(struct starpu_history_entry *entry, struct starpu_history_list **list, struct starpu_history_table **history_ptr)
			
 
				 {
			
 
				 	struct starpu_history_list *link;
			
 
				-	struct starpu_history_entry *old;
			
 
				+	struct starpu_history_table *table;
			
 
				 
			
 
				 	link = (struct starpu_history_list *) malloc(sizeof(struct starpu_history_list));
			
 
				 	link->next = *list;
			
 
				 	link->entry = entry;
			
 
				 	*list = link;
			
 
				 
			
 
				-	old = (struct starpu_history_entry *) _starpu_htbl_insert_32(history_ptr, entry->footprint, entry);
			
 
				-	/* that may fail in case there is some concurrency issue */
			
 
				-	STARPU_ASSERT(old == NULL);
			
 
				+	/* detect concurrency issue */
			
 
				+	//HASH_FIND_UINT32_T(*history_ptr, &entry->footprint, table);
			
 
				+	//STARPU_ASSERT(table == NULL);
			
 
				+
			
 
				+	table = (struct starpu_history_table*) malloc(sizeof(*table));
			
 
				+	STARPU_ASSERT(table != NULL);
			
 
				+	table->footprint = entry->footprint;
			
 
				+	table->history_entry = entry;
			
 
				+	HASH_ADD_UINT32_T(*history_ptr, footprint, table);
			
 
				 }
			
 
				 
			
 
				 static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, unsigned arch, unsigned nimpl)
			
@@ -524,6 +541,9 @@ static void initialize_per_arch_model(struct starpu_per_arch_perfmodel *per_arch
 
				 {
			
 
				 	per_arch_model->history = NULL;
			
 
				 	per_arch_model->list = NULL;
			
 
				+	per_arch_model->regression.nsample = 0;
			
 
				+	per_arch_model->regression.valid = 0;
			
 
				+	per_arch_model->regression.nl_valid = 0;
			
 
				 }
			
 
				 
			
 
				 static void initialize_model(struct starpu_perfmodel *model)
			
@@ -699,8 +719,15 @@ void _starpu_deinitialize_registered_performance_models(void)
 
				 			{
			
 
				 				struct starpu_per_arch_perfmodel *archmodel = &model->per_arch[arch][nimpl];
			
 
				 				struct starpu_history_list *list, *plist;
			
 
				-				_starpu_htbl_destroy_32(archmodel->history, NULL);
			
 
				+				struct starpu_history_table *entry, *tmp;
			
 
				+
			
 
				+				HASH_ITER(hh, archmodel->history, entry, tmp)
			
 
				+				{
			
 
				+					HASH_DEL(archmodel->history, entry);
			
 
				+					free(entry);
			
 
				+				}
			
 
				 				archmodel->history = NULL;
			
 
				+
			
 
				 				list = archmodel->list;
			
 
				 				while (list) {
			
 
				 					free(list->entry);
			
@@ -852,6 +879,7 @@ int starpu_list_models(FILE *output)
 
				 int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel *model)
			
 
				 {
			
 
				 	model->symbol = strdup(symbol);
			
 
				+	initialize_model(model);
			
 
				 
			
 
				 	/* where is the file if it exists ? */
			
 
				 	char path[256];
			
@@ -966,16 +994,16 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 
				 	{
			
 
				 		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
			
 
				 		struct starpu_per_arch_perfmodel *per_arch_model = &model->per_arch[arch][nimpl];
			
 
				-		struct starpu_htbl32_node *history;
			
 
				-		struct starpu_history_entry *entry;
			
 
				+		struct starpu_history_table *history;
			
 
				+		struct starpu_history_table *entry;
			
 
				 
			
 
				 		_STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
			
 
				 		history = per_arch_model->history;
			
 
				-		entry = (struct starpu_history_entry *) _starpu_htbl_search_32(history, key);
			
 
				+		HASH_FIND_UINT32_T(history, &key, entry);
			
 
				 		_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
			
 
				 
			
 
				-		if (entry && entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
			
 
				-			exp = entry->mean;
			
 
				+		if (entry && entry->history_entry && entry->history_entry->nsample >= _STARPU_CALIBRATION_MINIMUM)
			
 
				+			exp = entry->history_entry->mean;
			
 
				 		else if (!model->benchmarking)
			
 
				 		{
			
 
				 			_STARPU_DISP("Warning: model %s is not calibrated enough, forcing calibration for this run. Use the STARPU_CALIBRATE environment variable to control this.\n", model->symbol);
			
@@ -992,7 +1020,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 
				 	double exp;
			
 
				 	struct starpu_per_arch_perfmodel *per_arch_model;
			
 
				 	struct starpu_history_entry *entry;
			
 
				-	struct starpu_htbl32_node *history;
			
 
				+	struct starpu_history_table *history, *elt;
			
 
				 
			
 
				 	uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
			
 
				 
			
@@ -1005,7 +1033,8 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, e
 
				 		return NAN;
			
 
				 	}
			
 
				 
			
 
				-	entry = (struct starpu_history_entry *) _starpu_htbl_search_32(history, key);
			
 
				+	HASH_FIND_UINT32_T(history, &key, elt);
			
 
				+	entry = (elt == NULL) ? NULL : elt->history_entry;
			
 
				 	_STARPU_PTHREAD_RWLOCK_UNLOCK(&model->model_rwlock);
			
 
				 
			
 
				 	exp = entry?entry->mean:NAN;
			
@@ -1053,35 +1082,32 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
 
				 		{
			
 
				 			struct starpu_history_entry *entry;
			
 
				-			struct starpu_htbl32_node *history;
			
 
				-			struct starpu_htbl32_node **history_ptr;
			
 
				+			struct starpu_history_table *elt;
			
 
				 			struct starpu_history_list **list;
			
 
				 			uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
			
 
				 
			
 
				-			history = per_arch_model->history;
			
 
				-			history_ptr = &per_arch_model->history;
			
 
				 			list = &per_arch_model->list;
			
 
				 
			
 
				-			entry = (struct starpu_history_entry *) _starpu_htbl_search_32(history, key);
			
 
				+			HASH_FIND_UINT32_T(per_arch_model->history, &key, elt);
			
 
				+			entry = (elt == NULL) ? NULL : elt->history_entry;
			
 
				 
			
 
				 			if (!entry)
			
 
				 			{
			
 
				 				/* this is the first entry with such a footprint */
			
 
				 				entry = (struct starpu_history_entry *) malloc(sizeof(struct starpu_history_entry));
			
 
				 				STARPU_ASSERT(entry);
			
 
				-					entry->mean = measured;
			
 
				-					entry->sum = measured;
			
 
				-
			
 
				-					entry->deviation = 0.0;
			
 
				-					entry->sum2 = measured*measured;
			
 
				+				entry->mean = measured;
			
 
				+				entry->sum = measured;
			
 
				 
			
 
				-					entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
			
 
				+				entry->deviation = 0.0;
			
 
				+				entry->sum2 = measured*measured;
			
 
				 
			
 
				-					entry->footprint = key;
			
 
				-					entry->nsample = 1;
			
 
				+				entry->size = _starpu_job_get_data_size(model, arch, nimpl, j);
			
 
				 
			
 
				-				insert_history_entry(entry, list, history_ptr);
			
 
				+				entry->footprint = key;
			
 
				+				entry->nsample = 1;
			
 
				 
			
 
				+				insert_history_entry(entry, list, &per_arch_model->history);
			
 
				 			}
			
 
				 			else
			
 
				 			{
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -48,14 +48,14 @@ static unsigned topology_is_initialized = 0;
 
				 static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *config);
			
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			
 
				-#  ifdef STARPU_USE_CUDA
			
 
				-static void _starpu_initialize_workers_cuda_gpuid(struct _starpu_machine_config *config);
			
 
				-/* Entry in the `devices_using_cuda' hash table.  */
			
 
				 struct handle_entry
			
 
				 {
			
 
				 	UT_hash_handle hh;
			
 
				 	unsigned gpuid;
			
 
				 };
			
 
				+#  ifdef STARPU_USE_CUDA
			
 
				+static void _starpu_initialize_workers_cuda_gpuid(struct _starpu_machine_config *config);
			
 
				+/* Entry in the `devices_using_cuda' hash table.  */
			
 
				 static struct handle_entry *devices_using_cuda;
			
 
				 #  endif
			
 
				 #  ifdef STARPU_USE_OPENCL
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -484,10 +484,14 @@ int starpu_conf_init(struct starpu_conf *conf)
 
				 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
			
 
				 	conf->nspus = starpu_get_env_number("STARPU_NGORDON");
			
 
				 	conf->calibrate = starpu_get_env_number("STARPU_CALIBRATE");
			
 
				+	conf->bus_calibrate = starpu_get_env_number("STARPU_BUS_CALIBRATE");
			
 
				 
			
 
				 	if (conf->calibrate == -1)
			
 
				 	     conf->calibrate = 0;
			
 
				 
			
 
				+	if (conf->bus_calibrate == -1)
			
 
				+	     conf->bus_calibrate = 0;
			
 
				+
			
 
				 	conf->use_explicit_workers_bindid = 0; /* TODO */
			
 
				 	conf->use_explicit_workers_cuda_gpuid = 0; /* TODO */
			
 
				 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
			
@@ -527,6 +531,7 @@ static void _starpu_conf_check_environment(struct starpu_conf *conf)
 
				 	_starpu_conf_set_value_against_environment("STARPU_NOPENCL", &conf->nopencl);
			
 
				 	_starpu_conf_set_value_against_environment("STARPU_NGORDON", &conf->nspus);
			
 
				 	_starpu_conf_set_value_against_environment("STARPU_CALIBRATE", &conf->calibrate);
			
 
				+	_starpu_conf_set_value_against_environment("STARPU_BUS_CALIBRATE", &conf->bus_calibrate);
			
 
				 	_starpu_conf_set_value_against_environment("STARPU_SINGLE_COMBINED_WORKER", &conf->single_combined_worker);
			
 
				 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_COPY", &conf->disable_asynchronous_copy);
			
 
				 }
			
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -586,19 +586,16 @@ void starpu_data_unregister_no_coherency(starpu_data_handle_t handle)
 
				 	_starpu_data_unregister(handle, 0);
			
 
				 }
			
 
				 
			
 
				-void starpu_data_unregister_lazy(starpu_data_handle_t handle) {
			
 
				+void starpu_data_unregister_submit(starpu_data_handle_t handle) {
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				 	handle->lazy_unregister = 1;
			
 
				 	_starpu_spin_unlock(&handle->header_lock);
			
 
				 	_starpu_data_unregister(handle, 0);
			
 
				 }
			
 
				 
			
 
				-void starpu_data_invalidate(starpu_data_handle_t handle)
			
 
				+static void _starpu_data_invalidate(void *data)
			
 
				 {
			
 
				-	STARPU_ASSERT(handle);
			
 
				-
			
 
				-	starpu_data_acquire(handle, STARPU_W);
			
 
				-
			
 
				+	starpu_data_handle_t handle = data;
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				 
			
 
				 	unsigned node;
			
@@ -620,6 +617,22 @@ void starpu_data_invalidate(starpu_data_handle_t handle)
 
				 	starpu_data_release(handle);
			
 
				 }
			
 
				 
			
 
				+void starpu_data_invalidate(starpu_data_handle_t handle)
			
 
				+{
			
 
				+	STARPU_ASSERT(handle);
			
 
				+
			
 
				+	starpu_data_acquire(handle, STARPU_W);
			
 
				+
			
 
				+	_starpu_data_invalidate(handle);
			
 
				+}
			
 
				+
			
 
				+void starpu_data_invalidate_submit(starpu_data_handle_t handle)
			
 
				+{
			
 
				+	STARPU_ASSERT(handle);
			
 
				+
			
 
				+	starpu_data_acquire_cb(handle, STARPU_W, _starpu_data_invalidate, handle);
			
 
				+}
			
 
				+
			
 
				 enum starpu_data_interface_id starpu_handle_get_interface_id(starpu_data_handle_t handle)
			
 
				 {
			
 
				 	return handle->ops->interfaceid;
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -91,9 +91,9 @@ static void _starpu_data_acquire_continuation_non_blocking(void *arg)
 
				 
			
 
				 	STARPU_ASSERT(handle);
			
 
				 
			
 
				-	struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
			
 
				+	struct _starpu_data_replicate *replicate = &handle->per_node[wrapper->node];
			
 
				 
			
 
				-	ret = _starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 0, 1,
			
 
				+	ret = _starpu_fetch_data_on_node(handle, replicate, wrapper->mode, 0, 1,
			
 
				 					 _starpu_data_acquire_fetch_data_callback, wrapper);
			
 
				 	STARPU_ASSERT(!ret);
			
 
				 }
			
@@ -114,7 +114,7 @@ static void starpu_data_acquire_cb_pre_sync_callback(void *arg)
 
				 }
			
 
				 
			
 
				 /* The data must be released by calling starpu_data_release later on */
			
 
				-int starpu_data_acquire_cb(starpu_data_handle_t handle,
			
 
				+int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node,
			
 
				 			   enum starpu_access_mode mode, void (*callback)(void *), void *arg)
			
 
				 {
			
 
				 	STARPU_ASSERT(handle);
			
@@ -125,6 +125,7 @@ int starpu_data_acquire_cb(starpu_data_handle_t handle,
 
				 	STARPU_ASSERT(wrapper);
			
 
				 
			
 
				 	wrapper->handle = handle;
			
 
				+	wrapper->node = node;
			
 
				 	wrapper->mode = mode;
			
 
				 	wrapper->callback = callback;
			
 
				 	wrapper->callback_arg = arg;
			
@@ -175,6 +176,12 @@ int starpu_data_acquire_cb(starpu_data_handle_t handle,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+int starpu_data_acquire_cb(starpu_data_handle_t handle,
			
 
				+			   enum starpu_access_mode mode, void (*callback)(void *), void *arg)
			
 
				+{
			
 
				+	return starpu_data_acquire_on_node_cb(handle, 0, mode, callback, arg);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  *	Block data request from application
			
 
				  */
			
@@ -186,9 +193,9 @@ static inline void _starpu_data_acquire_continuation(void *arg)
 
				 
			
 
				 	STARPU_ASSERT(handle);
			
 
				 
			
 
				-	struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
			
 
				+	struct _starpu_data_replicate *replicate = &handle->per_node[wrapper->node];
			
 
				 
			
 
				-	_starpu_fetch_data_on_node(handle, ram_replicate, wrapper->mode, 0, 0, NULL, NULL);
			
 
				+	_starpu_fetch_data_on_node(handle, replicate, wrapper->mode, 0, 0, NULL, NULL);
			
 
				 
			
 
				 	/* continuation of starpu_data_acquire */
			
 
				 	_STARPU_PTHREAD_MUTEX_LOCK(&wrapper->lock);
			
@@ -198,7 +205,7 @@ static inline void _starpu_data_acquire_continuation(void *arg)
 
				 }
			
 
				 
			
 
				 /* The data must be released by calling starpu_data_release later on */
			
 
				-int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mode)
			
 
				+int starpu_data_acquire_on_node(starpu_data_handle_t handle, unsigned node, enum starpu_access_mode mode)
			
 
				 {
			
 
				 	STARPU_ASSERT(handle);
			
 
				 	STARPU_ASSERT_MSG(handle->nchildren == 0, "Acquiring a partitioned data is not possible");
			
@@ -230,7 +237,7 @@ int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mod
 
				 	{
			
 
				 		.handle = handle,
			
 
				 		.mode = mode,
			
 
				-		.node = 0, // unused
			
 
				+		.node = node,
			
 
				 		.cond = PTHREAD_COND_INITIALIZER,
			
 
				 		.lock = PTHREAD_MUTEX_INITIALIZER,
			
 
				 		.finished = 0
			
@@ -278,8 +285,8 @@ int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mod
 
				 	if (!_starpu_attempt_to_submit_data_request_from_apps(handle, mode, _starpu_data_acquire_continuation, &wrapper))
			
 
				 	{
			
 
				 		/* no one has locked this data yet, so we proceed immediately */
			
 
				-		struct _starpu_data_replicate *ram_replicate = &handle->per_node[0];
			
 
				-		int ret = _starpu_fetch_data_on_node(handle, ram_replicate, mode, 0, 0, NULL, NULL);
			
 
				+		struct _starpu_data_replicate *replicate = &handle->per_node[node];
			
 
				+		int ret = _starpu_fetch_data_on_node(handle, replicate, mode, 0, 0, NULL, NULL);
			
 
				 		STARPU_ASSERT(!ret);
			
 
				 	}
			
 
				 	else
			
@@ -301,19 +308,29 @@ int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mod
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_access_mode mode)
			
 
				+{
			
 
				+	return starpu_data_acquire_on_node(handle, 0, mode);
			
 
				+}
			
 
				+
			
 
				 /* This function must be called after starpu_data_acquire so that the
			
 
				  * application release the data */
			
 
				-void starpu_data_release(starpu_data_handle_t handle)
			
 
				+void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node)
			
 
				 {
			
 
				 	STARPU_ASSERT(handle);
			
 
				 
			
 
				 	/* The application can now release the rw-lock */
			
 
				-	_starpu_release_data_on_node(handle, 0, &handle->per_node[0]);
			
 
				+	_starpu_release_data_on_node(handle, 0, &handle->per_node[node]);
			
 
				 
			
 
				 	/* In case there are some implicit dependencies, unlock the "post sync" tasks */
			
 
				 	_starpu_unlock_post_sync_tasks(handle);
			
 
				 }
			
 
				 
			
 
				+void starpu_data_release(starpu_data_handle_t handle)
			
 
				+{
			
 
				+	starpu_data_release_on_node(handle, 0);
			
 
				+}
			
 
				+
			
 
				 static void _prefetch_data_on_node(void *arg)
			
 
				 {
			
 
				 	struct user_interaction_wrapper *wrapper = (struct user_interaction_wrapper *) arg;
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -140,10 +140,26 @@ done:
 
				 static void init_context(int devid)
			
 
				 {
			
 
				 	cudaError_t cures;
			
 
				-	int workerid = starpu_worker_get_id();
			
 
				+	int workerid;
			
 
				 
			
 
				 	starpu_cuda_set_device(devid);
			
 
				 
			
 
				+#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				+	int nworkers = starpu_worker_get_count();
			
 
				+	for (workerid = 0; workerid < nworkers; workerid++) {
			
 
				+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			
 
				+		if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid) {
			
 
				+			int can;
			
 
				+			cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
			
 
				+			if (!cures && can) {
			
 
				+				cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
			
 
				+				if (cures)
			
 
				+					_STARPU_DEBUG("GPU-Direct %d -> %d\n", worker->devid, devid);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				 	/* force CUDA to initialize the context for real */
			
 
				 	cures = cudaFree(0);
			
 
				 	if (STARPU_UNLIKELY(cures)) {
			
@@ -166,6 +182,8 @@ static void init_context(int devid)
 
				 
			
 
				 	limit_gpu_mem_if_needed(devid);
			
 
				 
			
 
				+	workerid = starpu_worker_get_id();
			
 
				+
			
 
				 	cures = cudaStreamCreate(&streams[workerid]);
			
 
				 	if (STARPU_UNLIKELY(cures))
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -351,6 +351,10 @@ void _starpu_opencl_init(void)
 
				 						platform_valid = 0;
			
 
				 					}
			
 
				 				}
			
 
				+				if(strcmp(name, "StarPU Platform") == 0) {
			
 
				+					platform_valid = 0;
			
 
				+					_STARPU_DEBUG("Skipping StarPU's SOCL Platform\n");
			
 
				+				}
			
 
				 #ifdef STARPU_VERBOSE
			
 
				 				if (platform_valid)
			
 
				 					_STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
			
--- a/src/util/starpu_insert_task.c
+++ b/src/util/starpu_insert_task.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
			
 
				  * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -78,7 +78,7 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 
				 
			
 
				 	va_start(varg_list, cl);
			
 
				         struct starpu_task *task = starpu_task_create();
			
 
				-	int ret = _starpu_insert_task_create_and_submit(arg_buffer, cl, &task, varg_list);
			
 
				+	int ret = _starpu_insert_task_create_and_submit(arg_buffer, arg_buffer_size, cl, &task, varg_list);
			
 
				 
			
 
				 	if (ret == -ENODEV)
			
 
				 	{
			
@@ -87,3 +87,4 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 
				 	}
			
 
				         return ret;
			
 
				 }
			
 
				+
			
--- a/src/util/starpu_insert_task_utils.c
+++ b/src/util/starpu_insert_task_utils.c
@@ -58,6 +58,11 @@ size_t _starpu_insert_task_get_arg_size(va_list varg_list)
 
				 		{
			
 
				 			(void)va_arg(varg_list, starpu_data_handle_t);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_DATA_ARRAY)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, starpu_data_handle_t*);
			
 
				+			(void)va_arg(varg_list, int);
			
 
				+		}
			
 
				 		else if (arg_type==STARPU_VALUE)
			
 
				 		{
			
 
				 			(void)va_arg(varg_list, void *);
			
@@ -116,6 +121,11 @@ int _starpu_codelet_pack_args(size_t arg_buffer_size, char **arg_buffer, va_list
 
				 		{
			
 
				 			(void)va_arg(varg_list, starpu_data_handle_t);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_DATA_ARRAY)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, starpu_data_handle_t*);
			
 
				+			(void)va_arg(varg_list, int);
			
 
				+		}
			
 
				 		else if (arg_type==STARPU_VALUE)
			
 
				 		{
			
 
				 			/* We have a constant value: this should be followed by a pointer to the cst value and the size of the constant */
			
@@ -172,7 +182,7 @@ int _starpu_codelet_pack_args(size_t arg_buffer_size, char **arg_buffer, va_list
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-int _starpu_insert_task_create_and_submit(char *arg_buffer, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list)
			
 
				+int _starpu_insert_task_create_and_submit(char *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list)
			
 
				 {
			
 
				         int arg_type;
			
 
				 	unsigned current_buffer = 0;
			
@@ -203,6 +213,20 @@ int _starpu_insert_task_create_and_submit(char *arg_buffer, struct starpu_codele
 
				 
			
 
				 			current_buffer++;
			
 
				 		}
			
 
				+		else if (arg_type == STARPU_DATA_ARRAY)
			
 
				+		{
			
 
				+			// Expect to find a array of handles and its size
			
 
				+			starpu_data_handle_t *handles = va_arg(varg_list, starpu_data_handle_t *);
			
 
				+			int nb_handles = va_arg(varg_list, int);
			
 
				+
			
 
				+			int i;
			
 
				+			for(i=0 ; i<nb_handles ; i++)
			
 
				+			{
			
 
				+				(*task)->handles[current_buffer] = handles[i];
			
 
				+				current_buffer++;
			
 
				+			}
			
 
				+
			
 
				+		}
			
 
				 		else if (arg_type==STARPU_VALUE)
			
 
				 		{
			
 
				 			(void)va_arg(varg_list, void *);
			
@@ -261,6 +285,7 @@ int _starpu_insert_task_create_and_submit(char *arg_buffer, struct starpu_codele
 
				 
			
 
				 	(*task)->cl = cl;
			
 
				 	(*task)->cl_arg = arg_buffer;
			
 
				+	(*task)->cl_arg_size = arg_buffer_size;
			
 
				 
			
 
				 	/* The callback will free the argument stack and execute the
			
 
				 	 * application's callback, if any. */
			
@@ -271,7 +296,7 @@ int _starpu_insert_task_create_and_submit(char *arg_buffer, struct starpu_codele
 
				 
			
 
				 	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				 	{
			
 
				-		fprintf(stderr, "submission of task %p wih codelet %p failed (symbol `%s')\n",
			
 
				+		fprintf(stderr, "submission of task %p wih codelet %p failed (symbol `%s') (err: ENODEV)\n",
			
 
				 			*task, (*task)->cl,
			
 
				 			(*task)->cl->name ? (*task)->cl->name :
			
 
				 			((*task)->cl->model && (*task)->cl->model->symbol)?(*task)->cl->model->symbol:"none");
			
--- a/src/util/starpu_insert_task_utils.h
+++ b/src/util/starpu_insert_task_utils.h
@@ -23,7 +23,8 @@
 
				 
			
 
				 size_t _starpu_insert_task_get_arg_size(va_list varg_list);
			
 
				 int _starpu_codelet_pack_args(size_t arg_buffer_size, char **arg_buffer, va_list varg_list);
			
 
				-int _starpu_insert_task_create_and_submit(char *arg_buffer, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list);
			
 
				+int _starpu_insert_task_create_and_submit(char *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, va_list varg_list);
			
 
				+int _starpu_insert_task_create_and_submit_array(char *arg_buffer, size_t arg_buffer_size, struct starpu_codelet *cl, struct starpu_task **task, starpu_data_handle_t *handles, unsigned nb_handles, va_list varg_list);
			
 
				 
			
 
				 #endif // __STARPU_INSERT_TASK_UTILS_H__
			
 
				 
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -120,6 +120,7 @@ noinst_PROGRAMS =				\
 
				 	main/restart				\
			
 
				 	main/execute_on_a_specific_worker	\
			
 
				 	main/insert_task			\
			
 
				+	main/insert_task_array			\
			
 
				 	main/multithreaded			\
			
 
				 	main/multithreaded_init			\
			
 
				 	main/starpu_task_bundle			\
			
@@ -220,6 +221,7 @@ noinst_PROGRAMS =				\
 
				 	perfmodels/regression_based		\
			
 
				 	perfmodels/non_linear_regression_based	\
			
 
				 	perfmodels/feed				\
			
 
				+	perfmodels/valid_model			\
			
 
				 	sched_policies/data_locality            \
			
 
				 	sched_policies/execute_all_tasks        \
			
 
				 	sched_policies/simple_deps              \
			
--- a/tests/datawizard/data_invalidation.c
+++ b/tests/datawizard/data_invalidation.c
@@ -180,6 +180,30 @@ int main(int argc, char **argv)
 
				 		starpu_data_invalidate(v_handle);
			
 
				 	}
			
 
				 
			
 
				+	for (loop = 0; loop < NLOOPS; loop++)
			
 
				+	{
			
 
				+		struct starpu_task *memset_task;
			
 
				+		struct starpu_task *check_content_task;
			
 
				+
			
 
				+		memset_task = starpu_task_create();
			
 
				+		memset_task->cl = &memset_cl;
			
 
				+		memset_task->handles[0] = v_handle;
			
 
				+
			
 
				+		ret = starpu_task_submit(memset_task);
			
 
				+		if (ret == -ENODEV) goto enodev;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+		check_content_task = starpu_task_create();
			
 
				+		check_content_task->cl = &check_content_cl;
			
 
				+		check_content_task->handles[0] = v_handle;
			
 
				+
			
 
				+		ret = starpu_task_submit(check_content_task);
			
 
				+		if (ret == -ENODEV) goto enodev;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+
			
 
				+		starpu_data_invalidate_submit(v_handle);
			
 
				+	}
			
 
				+
			
 
				 	/* this should get rid of automatically allocated buffers */
			
 
				 	starpu_data_unregister(v_handle);
			
 
				 
			
--- a/tests/datawizard/lazy_unregister.c
+++ b/tests/datawizard/lazy_unregister.c
@@ -61,7 +61,7 @@ int main(void)
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 
			
 
				-	starpu_data_unregister_lazy(handle);
			
 
				+	starpu_data_unregister_submit(handle);
			
 
				 
			
 
				 	ret = starpu_task_submit(t1);
			
 
				 	if (ret == -ENODEV)
			
--- a/tests/loader.c
+++ b/tests/loader.c
@@ -130,7 +130,7 @@ static void decode(char **src, char *motif, char *value)
 
				 
			
 
				 	       to = strncpy(to, *src, strlen(*src)-strlen(y)); to += strlen(*src)-strlen(y);
			
 
				 	       to = strcpy(to, value); to += strlen(value);
			
 
				-	       to = stpcpy(to, y+strlen(motif));
			
 
				+	       strcpy(to, y+strlen(motif));
			
 
				 
			
 
				 	       *src = strdup(neo);
			
 
				 	       y = strstr(*src, motif);
			
@@ -165,6 +165,17 @@ int main(int argc, char *argv[])
 
				 		sprintf(test_args, "%s/examples/spmv/matrix_market/examples/fidapm05.mtx", STARPU_SRC_DIR);
			
 
				 	}
			
 
				 
			
 
				+	if (strstr(test_name, "starpu_perfmodel_display"))
			
 
				+	{
			
 
				+		test_args = (char *) malloc(5*sizeof(char));
			
 
				+		sprintf(test_args, "-l");
			
 
				+	}
			
 
				+	if (strstr(test_name, "starpu_perfmodel_plot"))
			
 
				+	{
			
 
				+		test_args = (char *) malloc(5*sizeof(char));
			
 
				+		sprintf(test_args, "-l");
			
 
				+	}
			
 
				+
			
 
				 	/* get launcher program */
			
 
				 	launcher=getenv("STARPU_CHECK_LAUNCHER");
			
 
				 	launcher_args=getenv("STARPU_CHECK_LAUNCHER_ARGS");
			
--- a/tests/main/insert_task_array.c
+++ b/tests/main/insert_task_array.c
@@ -0,0 +1,88 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <config.h>
			
 
				+#include <starpu.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+void func_cpu(void *descr[], void *_args)
			
 
				+{
			
 
				+	int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+	float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
			
 
				+	int factor;
			
 
				+
			
 
				+	starpu_codelet_unpack_args(_args, &factor);
			
 
				+
			
 
				+	STARPU_SKIP_IF_VALGRIND;
			
 
				+        *x0 = *x0 * factor;
			
 
				+        *x1 = *x1 * (float)factor;
			
 
				+}
			
 
				+
			
 
				+struct starpu_codelet mycodelet =
			
 
				+{
			
 
				+	.modes = { STARPU_RW, STARPU_RW },
			
 
				+	.cpu_funcs = {func_cpu, NULL},
			
 
				+        .nbuffers = 2
			
 
				+};
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+        int x; float f;
			
 
				+	int factor=12;
			
 
				+        int i, ret;
			
 
				+        starpu_data_handle_t data_handles[2];
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	x = 1;
			
 
				+	starpu_variable_data_register(&data_handles[0], 0, (uintptr_t)&x, sizeof(x));
			
 
				+	f = 2.0;
			
 
				+	starpu_variable_data_register(&data_handles[1], 0, (uintptr_t)&f, sizeof(f));
			
 
				+
			
 
				+        ret = starpu_insert_task(&mycodelet,
			
 
				+				 STARPU_DATA_ARRAY, data_handles, 2,
			
 
				+				 STARPU_VALUE, &factor, sizeof(factor),
			
 
				+				 STARPU_PRIORITY, 1,
			
 
				+				 0);
			
 
				+	if (ret == -ENODEV) goto enodev;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_insert_task");
			
 
				+
			
 
				+        ret = starpu_task_wait_for_all();
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");
			
 
				+
			
 
				+enodev:
			
 
				+        for(i=0 ; i<2 ; i++)
			
 
				+	{
			
 
				+		starpu_data_unregister(data_handles[i]);
			
 
				+        }
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				+
			
 
				+	if (ret == -ENODEV)
			
 
				+	{
			
 
				+		fprintf(stderr, "WARNING: No one can execute this task\n");
			
 
				+		/* yes, we do not perform the computation but we did detect that no one
			
 
				+		 * could perform the kernel, so this is not an error from StarPU */
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				+	}
			
 
				+	else {
			
 
				+		FPRINTF(stderr, "VALUES: %d %f\n", x, f);
			
 
				+		ret = !(x == 12 && f == 24.0);
			
 
				+		STARPU_RETURN(ret);
			
 
				+	}
			
 
				+}
			
--- a/tests/perfmodels/valid_model.c
+++ b/tests/perfmodels/valid_model.c
@@ -0,0 +1,132 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2012  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <config.h>
			
 
				+#include <starpu.h>
			
 
				+#include "../helper.h"
			
 
				+
			
 
				+static void func(void *descr[], void *arg)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel rb_model =
			
 
				+{
			
 
				+	.type = STARPU_REGRESSION_BASED,
			
 
				+	.symbol = "valid_model_regression_based"
			
 
				+};
			
 
				+
			
 
				+static struct starpu_perfmodel nlrb_model =
			
 
				+{
			
 
				+	.type = STARPU_NL_REGRESSION_BASED,
			
 
				+	.symbol = "valid_model_non_linear_regression_based"
			
 
				+};
			
 
				+
			
 
				+static struct starpu_perfmodel hb_model =
			
 
				+{
			
 
				+	.type = STARPU_HISTORY_BASED,
			
 
				+	.symbol = "valid_model_history_based"
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet codelet =
			
 
				+{
			
 
				+	.cuda_funcs = {func, NULL},
			
 
				+	.opencl_funcs = {func, NULL},
			
 
				+	.cpu_funcs = {func, NULL},
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_W}
			
 
				+};
			
 
				+
			
 
				+static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model)
			
 
				+{
			
 
				+	int nloops = 123;
			
 
				+	int loop;
			
 
				+	starpu_data_handle_t handle;
			
 
				+	struct starpu_perfmodel lmodel;
			
 
				+	int ret;
			
 
				+	int old_nsamples, new_nsamples;
			
 
				+	struct starpu_conf conf;
			
 
				+	unsigned archid;
			
 
				+
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.sched_policy_name = "eager";
			
 
				+	conf.calibrate = 1;
			
 
				+
			
 
				+	ret = starpu_init(&conf);
			
 
				+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+
			
 
				+	codelet->model = model;
			
 
				+
			
 
				+	old_nsamples = 0;
			
 
				+	ret = starpu_load_history_debug(codelet->model->symbol, &lmodel);
			
 
				+	if (ret != 1)
			
 
				+		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
			
 
				+			old_nsamples += lmodel.per_arch[archid][0].regression.nsample;
			
 
				+
			
 
				+        starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, 100, sizeof(int));
			
 
				+	for (loop = 0; loop < nloops; loop++)
			
 
				+	{
			
 
				+		ret = starpu_insert_task(codelet, STARPU_W, handle, 0);
			
 
				+		if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				+	}
			
 
				+        starpu_data_unregister(handle);
			
 
				+	starpu_shutdown(); // To force dumping perf models on disk
			
 
				+
			
 
				+	ret = starpu_load_history_debug(codelet->model->symbol, &lmodel);
			
 
				+	if (ret == 1)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "The performance model could not be loaded\n");
			
 
				+		return 1;
			
 
				+	}
			
 
				+
			
 
				+	new_nsamples = 0;
			
 
				+	for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
			
 
				+		new_nsamples += lmodel.per_arch[archid][0].regression.nsample;
			
 
				+
			
 
				+	if (old_nsamples + nloops == new_nsamples)
			
 
				+	{
			
 
				+		FPRINTF(stderr, "Sampling for <%s> OK %d + %d == %d\n", codelet->model->symbol, old_nsamples, nloops, new_nsamples);
			
 
				+		return EXIT_SUCCESS;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		FPRINTF(stderr, "Sampling for <%s> failed %d + %d != %d\n", codelet->model->symbol, old_nsamples, nloops, new_nsamples);
			
 
				+		return EXIT_FAILURE;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	/* Use a linear regression model */
			
 
				+	ret = submit(&codelet, &rb_model);
			
 
				+	if (ret) return ret;
			
 
				+
			
 
				+	/* Use a non-linear regression model */
			
 
				+	ret = submit(&codelet, &nlrb_model);
			
 
				+	if (ret) return ret;
			
 
				+
			
 
				+#warning history based model cannot be validated with regression.nsample
			
 
				+#if 0
			
 
				+	/* Use a history model */
			
 
				+	ret = submit(&codelet, &hb_model);
			
 
				+	if (ret) return ret;
			
 
				+#endif
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
--- a/tools/Makefile.am
+++ b/tools/Makefile.am
@@ -31,10 +31,36 @@ EXTRA_DIST =					\
 
				 
			
 
				 CLEANFILES = *.gcno *.gcda *.linkinfo
			
 
				 
			
 
				-bin_PROGRAMS += starpu_calibrate_bus
			
 
				+#####################################
			
 
				+# What to install and what to check #
			
 
				+#####################################
			
 
				+
			
 
				+STARPU_TOOLS	=
			
 
				+TESTS		= $(STARPU_TOOLS)
			
 
				+
			
 
				+if STARPU_HAVE_WINDOWS
			
 
				+check_PROGRAMS	=	$(STARPU_TOOLS)
			
 
				+else
			
 
				+check_PROGRAMS	=	$(LOADER) $(STARPU_TOOLS)
			
 
				+endif
			
 
				+
			
 
				+if !STARPU_HAVE_WINDOWS
			
 
				+## test loader program
			
 
				+LOADER			=	loader
			
 
				+loader_CPPFLAGS =  $(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
			
 
				+LOADER_BIN		=	$(abs_top_builddir)/tools/$(LOADER)
			
 
				+loader_SOURCES		=	../tests/loader.c
			
 
				+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN)
			
 
				+endif
			
 
				 
			
 
				 if STARPU_USE_FXT
			
 
				-bin_PROGRAMS += starpu_fxt_tool starpu_fxt_stats
			
 
				+bin_PROGRAMS += 			\
			
 
				+	starpu_fxt_tool			\
			
 
				+	starpu_fxt_stats
			
 
				+
			
 
				+STARPU_TOOLS += 			\
			
 
				+	starpu_fxt_tool			\
			
 
				+	starpu_fxt_stats
			
 
				 
			
 
				 starpu_fxt_tool_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS) $(FXT_CFLAGS)
			
 
				 starpu_fxt_tool_LDADD = $(FXT_LIBS)
			
@@ -43,14 +69,21 @@ starpu_fxt_stats_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS) $(FXT_CFLAGS)
 
				 starpu_fxt_stats_LDADD = $(FXT_LIBS)
			
 
				 endif
			
 
				 
			
 
				-bin_PROGRAMS += \
			
 
				+bin_PROGRAMS += 			\
			
 
				+	starpu_perfmodel_display	\
			
 
				+	starpu_perfmodel_plot 		\
			
 
				+	starpu_machine_display		\
			
 
				+	starpu_calibrate_bus
			
 
				+
			
 
				+STARPU_TOOLS	+=			\
			
 
				 	starpu_perfmodel_display	\
			
 
				 	starpu_perfmodel_plot 		\
			
 
				-	starpu_machine_display
			
 
				+	starpu_machine_display		\
			
 
				+	starpu_calibrate_bus
			
 
				 
			
 
				 noinst_PROGRAMS =	cbc2paje lp2paje
			
 
				 
			
 
				-dist_bin_SCRIPTS +=	\
			
 
				+dist_bin_SCRIPTS +=			\
			
 
				 	starpu_workers_activity
			
 
				 
			
 
				 
			
@@ -79,4 +112,4 @@ dist_man1_MANS =\
 
				 endif
			
 
				 
			
 
				 showcheck:
			
 
				-	-cat /dev/null
			
 
				+	-cat $(TEST_LOGS) /dev/null
			
--- a/tools/starpu_calibrate_bus.c
+++ b/tools/starpu_calibrate_bus.c
@@ -71,18 +71,14 @@ static void parse_args(int argc, char **argv)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-#ifdef __MINGW32__
			
 
				-	WSADATA wsadata;
			
 
				-	WSAStartup(MAKEWORD(1,0), &wsadata);
			
 
				-#endif
			
 
				+	struct starpu_conf conf;
			
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				-	if (starpu_init(NULL) == -ENODEV)
			
 
				-		return 0; /* Nothing to calibrate, so this is a success :) */
			
 
				-
			
 
				-	starpu_force_bus_sampling();
			
 
				+	starpu_conf_init(&conf);
			
 
				+	conf.bus_calibrate = 1;
			
 
				 
			
 
				+	starpu_init(&conf);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/tools/starpu_machine_display.c
+++ b/tools/starpu_machine_display.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011-2012  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -96,7 +96,7 @@ Usage: %s [OPTION]                                            \n	\
 
				 Options:                                                      \n	\
			
 
				 	-h, --help       display this help and exit           \n	\
			
 
				 	-v, --version    output version information and exit  \n	\
			
 
				-        -f, --force      force bus sampling                   \n	\
			
 
				+	-f, --force      force bus sampling and show measures \n	\
			
 
				                                                               \n	\
			
 
				 Report bugs to <" PACKAGE_BUGREPORT ">.\n",
			
 
				 PROGNAME);
			
@@ -119,17 +119,17 @@ PROGNAME);
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int force = 0;
			
 
				+	struct starpu_conf conf;
			
 
				 
			
 
				 	parse_args(argc, argv, &force);
			
 
				 
			
 
				+	starpu_conf_init(&conf);
			
 
				+	if (force)
			
 
				+		conf.bus_calibrate = 1;
			
 
				+
			
 
				 	/* Even if starpu_init returns -ENODEV, we should go on : we will just
			
 
				 	 * print that we found no device. */
			
 
				-	(void) starpu_init(NULL);
			
 
				-
			
 
				-	if (force)
			
 
				-	{
			
 
				-		starpu_force_bus_sampling();
			
 
				-	}
			
 
				+	(void) starpu_init(&conf);
			
 
				 
			
 
				 	unsigned ncpu = starpu_cpu_worker_get_count();
			
 
				 	unsigned ncuda = starpu_cuda_worker_get_count();
			
@@ -151,9 +151,6 @@ int main(int argc, char **argv)
 
				 	fprintf(stdout, "\nbandwidth ...\n");
			
 
				 	starpu_bus_print_bandwidth(stdout);
			
 
				 
			
 
				-	fprintf(stdout, "\naffinity ...\n");
			
 
				-	starpu_bus_print_affinity(stdout);
			
 
				-
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/tools/starpu_perfmodel_display.c
+++ b/tools/starpu_perfmodel_display.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011  Université de Bordeaux 1
			
 
				- * Copyright (C) 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -31,8 +31,6 @@
 
				 
			
 
				 #define PROGNAME "starpu_perfmodel_display"
			
 
				 
			
 
				-static struct starpu_perfmodel model;
			
 
				-
			
 
				 /* display all available models */
			
 
				 static int list = 0;
			
 
				 /* what kernel ? */
			
@@ -60,6 +58,7 @@ static void usage(char **argv)
 
				 	fprintf(stderr, "   -h, --help          display this help and exit\n");
			
 
				 	fprintf(stderr, "   -v, --version       output version information and exit\n\n");
			
 
				         fprintf(stderr, "Reports bugs to <"PACKAGE_BUGREPORT">.");
			
 
				+        fprintf(stderr, "\n");
			
 
				 }
			
 
				 
			
 
				 static void parse_args(int argc, char **argv)
			
@@ -80,8 +79,10 @@ static void parse_args(int argc, char **argv)
 
				 	};
			
 
				 
			
 
				 	int option_index;
			
 
				-	while ((c = getopt_long(argc, argv, "ls:p:a:f:h", long_options, &option_index)) != -1) {
			
 
				-	switch (c) {
			
 
				+	while ((c = getopt_long(argc, argv, "ls:p:a:f:h", long_options, &option_index)) != -1)
			
 
				+	{
			
 
				+		switch (c)
			
 
				+		{
			
 
				                 case 'l':
			
 
				                         /* list all models */
			
 
				                         list = 1;
			
@@ -121,7 +122,7 @@ static void parse_args(int argc, char **argv)
 
				 		case '?':
			
 
				 		default:
			
 
				 			fprintf(stderr, "Unrecognized option: -%c\n", optopt);
			
 
				-	}
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	if (!symbol && !list)
			
@@ -141,23 +142,27 @@ static void display_history_based_perf_model(struct starpu_per_arch_perfmodel *p
 
				 	if (!parameter && ptr)
			
 
				 		fprintf(stderr, "# hash\t\tsize\t\tmean\t\tdev\t\tn\n");
			
 
				 
			
 
				-	while (ptr) {
			
 
				+	while (ptr)
			
 
				+	{
			
 
				 		struct starpu_history_entry *entry = ptr->entry;
			
 
				 		if (!display_specific_footprint || (entry->footprint == specific_footprint))
			
 
				 		{
			
 
				 			if (!parameter)
			
 
				-			{	
			
 
				+			{
			
 
				 				/* There isn't a parameter that is explicitely requested, so we display all parameters */
			
 
				 				printf("%08x\t%-15lu\t%-15le\t%-15le\t%u\n", entry->footprint,
			
 
				 					(unsigned long) entry->size, entry->mean, entry->deviation, entry->nsample);
			
 
				 			}
			
 
				-			else {
			
 
				+			else
			
 
				+			{
			
 
				 				/* only display the parameter that was specifically requested */
			
 
				-				if (strcmp(parameter, "mean") == 0) {
			
 
				+				if (strcmp(parameter, "mean") == 0)
			
 
				+				{
			
 
				 					printf("%-15le\n", entry->mean);
			
 
				 				}
			
 
				-		
			
 
				-				if (strcmp(parameter, "stddev") == 0) {
			
 
				+
			
 
				+				if (strcmp(parameter, "stddev") == 0)
			
 
				+				{
			
 
				 					printf("%-15le\n", entry->deviation);
			
 
				 					return;
			
 
				 				}
			
@@ -173,8 +178,8 @@ static void display_perf_model(struct starpu_perfmodel *model, enum starpu_perf_
 
				 	struct starpu_per_arch_perfmodel *arch_model = &model->per_arch[arch][nimpl];
			
 
				 	char archname[32];
			
 
				 
			
 
				-	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list) {
			
 
				-
			
 
				+	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
			
 
				+	{
			
 
				 		starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
			
 
				 		fprintf(stderr, "performance model for %s\n", archname);
			
 
				 	}
			
@@ -183,8 +188,9 @@ static void display_perf_model(struct starpu_perfmodel *model, enum starpu_perf_
 
				 	{
			
 
				 		/* no specific parameter was requested, so we display everything */
			
 
				 		if (arch_model->regression.nsample)
			
 
				-			fprintf(stderr, "\tRegression : #sample = %d\n",
			
 
				-				arch_model->regression.nsample);
			
 
				+		{
			
 
				+			fprintf(stderr, "\tRegression : #sample = %d\n", arch_model->regression.nsample);
			
 
				+		}
			
 
				 
			
 
				 		/* Only display the regression model if we could actually build a model */
			
 
				 		if (arch_model->regression.valid)
			
@@ -193,10 +199,11 @@ static void display_perf_model(struct starpu_perfmodel *model, enum starpu_perf_
 
				 			fprintf(stderr, "\t\talpha = %e\n", arch_model->regression.alpha);
			
 
				 			fprintf(stderr, "\t\tbeta = %e\n", arch_model->regression.beta);
			
 
				 		}
			
 
				-		else {
			
 
				+		else
			
 
				+		{
			
 
				 			//fprintf(stderr, "\tLinear model is INVALID\n");
			
 
				 		}
			
 
				-	
			
 
				+
			
 
				 		if (arch_model->regression.nl_valid)
			
 
				 		{
			
 
				 			fprintf(stderr, "\tNon-Linear: y = a size ^b + c\n");
			
@@ -204,7 +211,8 @@ static void display_perf_model(struct starpu_perfmodel *model, enum starpu_perf_
 
				 			fprintf(stderr, "\t\tb = %e\n", arch_model->regression.b);
			
 
				 			fprintf(stderr, "\t\tc = %e\n", arch_model->regression.c);
			
 
				 		}
			
 
				-		else {
			
 
				+		else
			
 
				+		{
			
 
				 			//fprintf(stderr, "\tNon-Linear model is INVALID\n");
			
 
				 		}
			
 
				 
			
@@ -216,41 +224,49 @@ static void display_perf_model(struct starpu_perfmodel *model, enum starpu_perf_
 
				 		printf("\t debug file path : %s\n", debugname);
			
 
				 #endif
			
 
				 	}
			
 
				-	else {
			
 
				+	else
			
 
				+	{
			
 
				 		/* only display the parameter that was specifically requested */
			
 
				-		if (strcmp(parameter, "a") == 0) {
			
 
				+		if (strcmp(parameter, "a") == 0)
			
 
				+		{
			
 
				 			printf("%e\n", arch_model->regression.a);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(parameter, "b") == 0) {
			
 
				+		if (strcmp(parameter, "b") == 0)
			
 
				+		{
			
 
				 			printf("%e\n", arch_model->regression.b);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(parameter, "c") == 0) {
			
 
				+		if (strcmp(parameter, "c") == 0)
			
 
				+		{
			
 
				 			printf("%e\n", arch_model->regression.c);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(parameter, "alpha") == 0) {
			
 
				+		if (strcmp(parameter, "alpha") == 0)
			
 
				+		{
			
 
				 			printf("%e\n", arch_model->regression.alpha);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(parameter, "beta") == 0) {
			
 
				+		if (strcmp(parameter, "beta") == 0)
			
 
				+		{
			
 
				 			printf("%e\n", arch_model->regression.beta);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(parameter, "path-file-debug") == 0) {
			
 
				+		if (strcmp(parameter, "path-file-debug") == 0)
			
 
				+		{
			
 
				 			char debugname[256];
			
 
				 			starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
			
 
				 			printf("%s\n", debugname);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		if ((strcmp(parameter, "mean") == 0) || (strcmp(parameter, "stddev"))) {
			
 
				+		if ((strcmp(parameter, "mean") == 0) || (strcmp(parameter, "stddev")))
			
 
				+		{
			
 
				 			display_history_based_perf_model(arch_model);
			
 
				 			return;
			
 
				 		}
			
@@ -269,14 +285,18 @@ static void display_all_perf_models(struct starpu_perfmodel *model)
 
				 		/* display all architectures */
			
 
				 		unsigned archid;
			
 
				 		unsigned implid;
			
 
				-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++) {
			
 
				-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) { /* Display all codelets on each arch */
			
 
				+		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
			
 
				+		{
			
 
				+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				+			{ /* Display all codelets on each arch */
			
 
				 				display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-	else {
			
 
				-		if (strcmp(arch, "cpu") == 0) {
			
 
				+	else
			
 
				+	{
			
 
				+		if (strcmp(arch, "cpu") == 0)
			
 
				+		{
			
 
				 			unsigned implid;
			
 
				 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
 
				 				display_perf_model(model, STARPU_CPU_DEFAULT,implid); /* Display all codelets on cpu */
			
@@ -299,11 +319,14 @@ static void display_all_perf_models(struct starpu_perfmodel *model)
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(arch, "cuda") == 0) {
			
 
				+		if (strcmp(arch, "cuda") == 0)
			
 
				+		{
			
 
				 			unsigned archid;
			
 
				 			unsigned implid;
			
 
				-			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++) {
			
 
				-				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++) {
			
 
				+			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++)
			
 
				+			{
			
 
				+				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
			
 
				+				{
			
 
				 					char archname[32];
			
 
				 					starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) archid, archname, 32, implid);
			
 
				 					fprintf(stderr, "performance model for %s\n", archname);
			
@@ -326,7 +349,8 @@ static void display_all_perf_models(struct starpu_perfmodel *model)
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(arch, "gordon") == 0) {
			
 
				+		if (strcmp(arch, "gordon") == 0)
			
 
				+		{
			
 
				 			fprintf(stderr, "performance model for gordon\n");
			
 
				 			unsigned implid;
			
 
				 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
			
@@ -341,8 +365,6 @@ static void display_all_perf_models(struct starpu_perfmodel *model)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-//	assert(argc == 2);
			
 
				-
			
 
				 #ifdef __MINGW32__
			
 
				 	WSADATA wsadata;
			
 
				 	WSAStartup(MAKEWORD(1,0), &wsadata);
			
@@ -350,21 +372,24 @@ int main(int argc, char **argv)
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				-        if (list) {
			
 
				+        if (list)
			
 
				+	{
			
 
				                 int ret = starpu_list_models(stdout);
			
 
				-                if (ret) {
			
 
				+                if (ret)
			
 
				+		{
			
 
				                         fprintf(stderr, "The performance model directory is invalid\n");
			
 
				                         return 1;
			
 
				                 }
			
 
				         }
			
 
				-        else {
			
 
				+        else
			
 
				+	{
			
 
				+		struct starpu_perfmodel model;
			
 
				                 int ret = starpu_load_history_debug(symbol, &model);
			
 
				                 if (ret == 1)
			
 
				-                        {
			
 
				-                                fprintf(stderr, "The performance model could not be loaded\n");
			
 
				-                                return 1;
			
 
				-                        }
			
 
				-
			
 
				+		{
			
 
				+			fprintf(stderr, "The performance model could not be loaded\n");
			
 
				+			return 1;
			
 
				+		}
			
 
				                 display_all_perf_models(&model);
			
 
				         }
			
 
				 
			
--- a/tools/starpu_perfmodel_plot.c
+++ b/tools/starpu_perfmodel_plot.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2011  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -74,6 +74,7 @@ given perfmodel\n");
 
				 	fprintf(stderr, "   -h, --help          display this help and exit\n");
			
 
				 	fprintf(stderr, "   -v, --version       output version information and exit\n\n");
			
 
				         fprintf(stderr, "Report bugs to <%s>.", PACKAGE_BUGREPORT);
			
 
				+        fprintf(stderr, "\n");
			
 
				 }
			
 
				 
			
 
				 static void parse_args(int argc, char **argv)
			
@@ -147,6 +148,14 @@ static void parse_args(int argc, char **argv)
 
				 			continue;
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				+	if (!symbol && !list)
			
 
				+	{
			
 
				+		fprintf(stderr, "Incorrect usage, aborting\n");
			
 
				+                usage(argv);
			
 
				+		exit(-1);
			
 
				+	}
			
 
				+
			
 
				 }
			
 
				 
			
 
				 static void print_comma(FILE *gnuplot_file, int *first)
			
@@ -413,13 +422,6 @@ int main(int argc, char **argv)
 
				 		return 0;
			
 
				         }
			
 
				 
			
 
				-	/* We need at least a symbol name */
			
 
				-	if (!symbol)
			
 
				-	{
			
 
				-		fprintf(stderr, "No symbol was specified\n");
			
 
				-		return 1;
			
 
				-	}
			
 
				-
			
 
				 	/* Load the performance model associated to the symbol */
			
 
				 	ret = starpu_load_history_debug(symbol, &model);
			
 
				 	if (ret == 1)